shithub: libvpx

Download patch

ref: 03b412d0449146ecd7e3398448cfa91c2acca05e
parent: a43bdcd7b021d7aa091a516ac313930b3d28fe6e
parent: d0ed677a34d4778d96ee4c31d04e153b52f14394
author: John Koleszar <[email protected]>
date: Tue Jun 11 14:19:14 EDT 2013

VP9 profile 0 release candidate

Merge experimental branch into master

Change-Id: Ie5f89fb977d28a4d98a8dcdf1c6eb97271a3c1db

--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -103,6 +103,18 @@
 .PHONY: testdata
 testdata::
 
+# Add compiler flags for intrinsic files
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -266,12 +266,13 @@
 fi
 TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
 TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
 TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
 TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
 TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
 
 clean_temp_files() {
-    rm -f ${TMP_C} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
 }
 
 #
@@ -292,9 +293,9 @@
 
 check_cxx() {
     log check_cxx "$@"
-    cat >${TMP_C}
-    log_file ${TMP_C}
-    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+    cat >${TMP_CC}
+    log_file ${TMP_CC}
+    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
 }
 
 check_cpp() {
@@ -1071,7 +1072,7 @@
                 tune_cflags="-march="
                 setup_gnu_toolchain
                 #for 32 bit x86 builds, -O3 did not turn on this flag
-                enabled optimizations && check_add_cflags -fomit-frame-pointer
+                enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
             ;;
             vs*)
                 # When building with Microsoft Visual Studio the assembler is
--- a/configure
+++ b/configure
@@ -243,19 +243,11 @@
     unistd_h
 "
 EXPERIMENT_LIST="
-    csm
-    new_mvref
-    implicit_segmentation
-    newbintramodes
-    comp_interintra_pred
-    enable_6tap
-    abovesprefmv
-    code_nonzerocount
-    useselectrefmv
-    modelcoefprob
-    loop_dering
-    implicit_compoundinter_weight
-    scatterscan
+    oneshotq
+    multiple_arf
+    non420
+    alpha
+    balanced_coeftree
 "
 CONFIG_LIST="
     external_build
@@ -608,7 +600,10 @@
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
-        check_add_cflags -Wunused-but-set-variable
+        case ${CC} in
+          *clang*) ;;
+          *) check_add_cflags -Wunused-but-set-variable ;;
+        esac
         enabled extra_warnings || check_add_cflags -Wno-unused-function
     fi
 
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -34,6 +34,13 @@
     return (value >> 24) & 0xff;
   }
 
+  uint8_t Rand8Extremes(void) {
+    // Returns a random value near 0 or near 255, to better exercise
+    // saturation behavior.
+    const uint8_t r = Rand8();
+    return r < 128 ? r << 4 : r >> 4;
+  }
+
   int PseudoUniform(int range) {
     return random_.Generate(range);
   }
--- /dev/null
+++ b/test/borders_test.cc
@@ -1,0 +1,86 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  BordersTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if ( video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 0);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q.  This pushes the encoder to producing
+  // lots of small partitions which will likely test the other condition.
+
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
+}  // namespace
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
 #include "./vpx_config.h"
@@ -16,10 +20,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
 
 namespace {
 typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
@@ -46,20 +46,20 @@
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
-static uint8_t clip_pixel(int x) {
+uint8_t clip_pixel(int x) {
   return x < 0 ? 0 :
          x > 255 ? 255 :
          x;
 }
 
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride,
-                               unsigned int output_width,
-                               unsigned int output_height) {
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
@@ -66,7 +66,7 @@
   // support.
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
 
   /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
    * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
@@ -75,7 +75,7 @@
    *                               = 23
    * and filter_max_width = 16
    */
-  uint8_t intermediate_buffer[23 * 16];
+  uint8_t intermediate_buffer[71 * 64];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -87,15 +87,15 @@
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -115,15 +115,15 @@
     for (i = 0; i < output_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -135,12 +135,12 @@
   }
 }
 
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              unsigned int output_width,
-                              unsigned int output_height) {
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -150,21 +150,21 @@
   }
 }
 
-static void filter_average_block2d_8_c(const uint8_t *src_ptr,
-                                       const unsigned int src_stride,
-                                       const int16_t *HFilter,
-                                       const int16_t *VFilter,
-                                       uint8_t *dst_ptr,
-                                       unsigned int dst_stride,
-                                       unsigned int output_width,
-                                       unsigned int output_height) {
-  uint8_t tmp[16*16];
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[64 * 64];
 
-  assert(output_width <= 16);
-  assert(output_height <= 16);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,
+  assert(output_width <= 64);
+  assert(output_height <= 64);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
                      output_width, output_height);
-  block2d_average_c(tmp, 16, dst_ptr, dst_stride,
+  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -173,10 +173,9 @@
   static void SetUpTestCase() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
-        + 1;
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
   }
 
   static void TearDownTestCase() {
@@ -186,62 +185,63 @@
     output_ = NULL;
   }
 
-  protected:
-    static const int kDataAlignment = 16;
-    static const int kOuterBlockSize = 32;
-    static const int kInputStride = kOuterBlockSize;
-    static const int kOutputStride = kOuterBlockSize;
-    static const int kMaxDimension = 16;
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 128;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 64;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
-    int Width() const { return GET_PARAM(0); }
-    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const {
-      const int center = (kOuterBlockSize - Width()) / 2;
-      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
-    }
-    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
-    bool IsIndexInBorder(int i) {
-      return (i < BorderTop() * kOuterBlockSize ||
-              i >= (BorderTop() + Height()) * kOuterBlockSize ||
-              i % kOuterBlockSize < BorderLeft() ||
-              i % kOuterBlockSize >= (BorderLeft() + Width()));
-    }
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
 
-    virtual void SetUp() {
-      UUT_ = GET_PARAM(2);
-      memset(input_, 0, sizeof(input_));
-      /* Set up guard blocks for an inner block cetered in the outer block */
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          output_[i] = 255;
-        else
-          output_[i] = 0;
-      }
-
-      ::libvpx_test::ACMRandom prng;
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
-        input_[i] = prng.Rand8();
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
     }
 
-    void CheckGuardBlocks() {
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          EXPECT_EQ(255, output_[i]);
-      }
-    }
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i)
+      input_[i] = prng.Rand8Extremes();
+  }
 
-    uint8_t* input() {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
     }
+  }
 
-    uint8_t* output() {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
+  uint8_t* input() const {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    const ConvolveFunctions* UUT_;
-    static uint8_t* input_;
-    static uint8_t* output_;
+  uint8_t* output() const {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
+
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
@@ -303,12 +303,34 @@
 
 const int16_t (*kTestFilterList[])[8] = {
   vp9_bilinear_filters,
-  vp9_sub_pel_filters_6,
   vp9_sub_pel_filters_8,
   vp9_sub_pel_filters_8s,
   vp9_sub_pel_filters_8lp
 };
+const int kNumFilterBanks = sizeof(kTestFilterList) /
+                            sizeof(kTestFilterList[0]);
+const int kNumFilters = 16;
 
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];
+    for (int i = 0; i < kNumFilters; i++) {
+      const int p0 = filters[i][0] + filters[i][1];
+      const int p1 = filters[i][2] + filters[i][3];
+      const int p2 = filters[i][4] + filters[i][5];
+      const int p3 = filters[i][6] + filters[i][7];
+      EXPECT_LE(p0, 128);
+      EXPECT_LE(p1, 128);
+      EXPECT_LE(p2, 128);
+      EXPECT_LE(p3, 128);
+      EXPECT_LE(p0 + p3, 128);
+      EXPECT_LE(p0 + p3 + p1, 128);
+      EXPECT_LE(p0 + p3 + p1 + p2, 128);
+      EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+    }
+  }
+}
+
 const int16_t kInvalidFilter[8] = { 0 };
 
 TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
@@ -316,12 +338,9 @@
   uint8_t* const out = output();
   uint8_t ref[kOutputStride * kMaxDimension];
 
-  const int kNumFilterBanks = sizeof(kTestFilterList) /
-      sizeof(kTestFilterList[0]);
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const int16_t (*filters)[8] = kTestFilterList[filter_bank];
-    const int kNumFilters = 16;
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -368,7 +387,7 @@
   ::libvpx_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
     for (int x = 0; x < Width(); ++x) {
-      const uint8_t r = prng.Rand8();
+      const uint8_t r = prng.Rand8Extremes();
 
       out[y * kOutputStride + x] = r;
       ref[y * kOutputStride + x] = r;
@@ -440,6 +459,7 @@
 TEST_P(ConvolveTest, ChangeFilterWorks) {
   uint8_t* const in = input();
   uint8_t* const out = output();
+  const int kPixelSelected = 4;
 
   REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
                                  kChangeFilters[8], 17, kChangeFilters[4], 16,
@@ -446,10 +466,10 @@
                                  Width(), Height()));
 
   for (int x = 0; x < Width(); ++x) {
-    if (x < 8)
-      ASSERT_EQ(in[4], out[x]) << "x == " << x;
-    else
-      ASSERT_EQ(in[12], out[x]) << "x == " << x;
+    const int kQ4StepAdjust = x >> 4;
+    const int kFilterPeriodAdjust = (x >> 3) << 3;
+    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
   }
 
   REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
@@ -457,10 +477,10 @@
                                  Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
-    if (y < 8)
-      ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;
-    else
-      ASSERT_EQ(in[12 * kInputStride], out[y * kOutputStride]) << "y == " << y;
+    const int kQ4StepAdjust = y >> 4;
+    const int kFilterPeriodAdjust = (y >> 3) << 3;
+    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
   }
 
   REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
@@ -468,9 +488,13 @@
                                   Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjustY = y >> 4;
+    const int kFilterPeriodAdjustY = (y >> 3) << 3;
+    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
     for (int x = 0; x < Width(); ++x) {
-      const int ref_x = x < 8 ? 4 : 12;
-      const int ref_y = y < 8 ? 4 : 12;
+      const int kQ4StepAdjustX = x >> 4;
+      const int kFilterPeriodAdjustX = (x >> 3) << 3;
+      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
 
       ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
           << "x == " << x << ", y == " << y;
@@ -489,10 +513,17 @@
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
+    make_tuple(4, 8, &convolve8_c),
     make_tuple(8, 8, &convolve8_c),
     make_tuple(16, 8, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c)));
-}
+    make_tuple(8, 16, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c),
+    make_tuple(32, 16, &convolve8_c),
+    make_tuple(16, 32, &convolve8_c),
+    make_tuple(32, 32, &convolve8_c),
+    make_tuple(64, 32, &convolve8_c),
+    make_tuple(32, 64, &convolve8_c),
+    make_tuple(64, 64, &convolve8_c)));
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -503,7 +534,16 @@
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_ssse3),
     make_tuple(8, 4, &convolve8_ssse3),
+    make_tuple(4, 8, &convolve8_ssse3),
     make_tuple(8, 8, &convolve8_ssse3),
     make_tuple(16, 8, &convolve8_ssse3),
-    make_tuple(16, 16, &convolve8_ssse3)));
+    make_tuple(8, 16, &convolve8_ssse3),
+    make_tuple(16, 16, &convolve8_ssse3),
+    make_tuple(32, 16, &convolve8_ssse3),
+    make_tuple(16, 32, &convolve8_ssse3),
+    make_tuple(32, 32, &convolve8_ssse3),
+    make_tuple(64, 32, &convolve8_ssse3),
+    make_tuple(32, 64, &convolve8_ssse3),
+    make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+}  // namespace
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -269,19 +270,23 @@
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[256], coeff[256];
-    int16_t out_c[256];
+    uint8_t dst[256], src[256];
     double out_r[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_c(coeff, out_c, 32);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
     for (int j = 0; j < 256; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@
     }
   }
 }
-#if 1
+
 // we need enable fdct test once we re-do the 16 point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[256];
     int16_t test_temp_block[256];
-    int16_t test_output_block[256];
+    uint8_t dst[256], src[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 32;
     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
 
     for (int j = 0; j < 256; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -354,6 +363,4 @@
     }
   }
 }
-#endif
-
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
   void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "test/acm_random.h"
@@ -91,28 +91,31 @@
   }
 }
 
-
 TEST(VP9Idct32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[1024], coeff[1024];
-    int16_t out_c[1024];
+    uint8_t dst[1024], src[1024];
     double out_r[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < 1024; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_c(coeff, out_c, 64);
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
     for (int j = 0; j < 1024; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
-          << "Error: 3x32 IDCT has error " << error
+          << "Error: 32x32 IDCT has error " << error
           << " at index " << j;
     }
   }
@@ -126,18 +129,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[1024];
     int16_t test_temp_block[1024];
-    int16_t test_output_block[1024];
+    uint8_t dst[1024], src[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 64;
     vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
 
     for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned diff = dst[j] - src[j];
       const unsigned error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -10,9 +10,10 @@
 #ifndef TEST_ENCODE_TEST_DRIVER_H_
 #define TEST_ENCODE_TEST_DRIVER_H_
 
-#include "./vpx_config.h"
 #include <string>
 #include <vector>
+
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_encoder.h"
 
@@ -46,7 +47,7 @@
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-    : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(NULL) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -92,7 +93,7 @@
     memset(&encoder_, 0, sizeof(encoder_));
   }
 
-  ~Encoder() {
+  virtual ~Encoder() {
     vpx_codec_destroy(&encoder_);
   }
 
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -206,11 +206,17 @@
   // reset previously set error/droppable frames
   Reset();
 
+#if 0
+  // TODO(jkoleszar): This test is disabled for the time being as too
+  // sensitive. It's not clear how to set a reasonable threshold for
+  // this behavior.
+
   // Now set an arbitrary set of error frames that are non-droppable
   unsigned int num_error_frames = 3;
   unsigned int error_frame_list[] = {3, 10, 20};
   SetErrorFrames(num_error_frames, error_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
   // Test that dropping an arbitrary set of inter frames does not hurt too much
   // Note the Average Mismatch PSNR is the average of the PSNR between
   // decoded frame and encoder's version of the same frame for all frames
@@ -219,6 +225,7 @@
   std::cout << "             Mismatch PSNR: "
             << psnr_resilience_mismatch << "\n";
   EXPECT_GT(psnr_resilience_mismatch, 20.0);
+#endif
 }
 
 VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -96,11 +96,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
 
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.
@@ -120,10 +124,10 @@
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
 
     for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -16,6 +16,7 @@
 
 extern "C" {
 #include "vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -100,11 +101,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[64];
     int16_t test_temp_block[64];
-    int16_t test_output_block[64];
+    uint8_t dst[64], src[64];
 
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
@@ -119,10 +124,10 @@
           test_temp_block[j] *= 4;
         }
     }
-    vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
 
     for (int j = 0; j < 64; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -145,18 +150,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[64];
     int16_t test_temp_block[64];
-    int16_t test_output_block[64];
+    uint8_t dst[64], src[64];
 
-    // Initialize a test block with input range {-255, 255}.
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8() % 2 ? 255 : 0;
+      dst[j] = src[j] > 0 ? 0 : 255;
+    }
+    // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
 
     for (int j = 0; j < 64; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -83,7 +83,7 @@
   void SetSize(unsigned int width, unsigned int height) {
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 1);
       ASSERT_TRUE(img_ != NULL);
       width_ = width;
       height_ = height;
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -112,20 +112,23 @@
   const int count_test_block = 10000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t input[64], coeff[64];
-    int16_t output_c[64];
     double output_r[64];
+    uint8_t dst[64], src[64];
 
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      input[j] = rnd.Rand8() - rnd.Rand8();
+      input[j] = src[j] - dst[j];
 
-    const int pitch = 16;
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
       coeff[j] = round(output_r[j]);
-    vp9_short_idct8x8_c(coeff, output_c, pitch);
+    vp9_short_idct8x8_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
-      const int diff = output_c[j] -input[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 8x8 FDCT/IDCT has error " << error
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -30,7 +30,7 @@
   }
 
   virtual void TearDown() {
-    delete modified_buf_;
+    delete[] modified_buf_;
   }
 
   virtual bool Continue() const {
@@ -59,7 +59,7 @@
         buffer[pkt->data.frame.sz - index_sz] == marker) {
       // frame is a superframe. strip off the index.
       if (modified_buf_)
-        delete modified_buf_;
+        delete[] modified_buf_;
       modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
       memcpy(modified_buf_, pkt->data.frame.buf,
              pkt->data.frame.sz - index_sz);
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,4 +1,5 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
 906b4c1e99eb734504c504b3f1ad8052137ce672  vp80-00-comprehensive-003.ivf
@@ -120,4 +121,4 @@
 41d70bb5fa45bc88da1604a0af466930b8dd77b5  vp80-05-sharpness-1438.ivf.md5
 086c56378df81b6cee264d7540a7b8f2b405c7a4  vp80-05-sharpness-1439.ivf.md5
 d32dc2c4165eb266ea4c23c14a45459b363def32  vp80-05-sharpness-1440.ivf.md5
-8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
\ No newline at end of file
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
--- a/test/test.mk
+++ b/test/test.mk
@@ -22,6 +22,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
@@ -92,6 +93,7 @@
 ## TEST DATA
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
 
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -188,11 +188,11 @@
 #endif
 
 #if HAVE_SSE2
-const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_wmt;
-const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_wmt;
-const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_wmt;
-const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_wmt;
-const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_wmt;
+const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
+const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
+const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
+const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
+const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9VarianceTest,
     ::testing::Values(make_tuple(4, 4, variance4x4_wmt),
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -103,7 +103,7 @@
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
       raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 32);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
       width_ = width;
       height_ = height;
     }
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -52,7 +52,7 @@
         const int random_seed = 6432;
         const int buffer_size = 10000;
         ACMRandom bit_rnd(random_seed);
-        BOOL_CODER bw;
+        vp9_writer bw;
         uint8_t bw_buffer[buffer_size];
         vp9_start_encode(&bw, bw_buffer);
 
@@ -63,13 +63,16 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          encode_bool(&bw, bit, static_cast<int>(probas[i]));
+          vp9_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
         vp9_stop_encode(&bw);
 
-        BOOL_DECODER br;
-        vp9_start_decode(&br, bw_buffer, buffer_size);
+        // First bit should be zero
+        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
+
+        vp9_reader br;
+        vp9_reader_init(&br, bw_buffer, buffer_size);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < bits_to_test; ++i) {
           if (bit_method == 2) {
@@ -77,7 +80,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(decode_bool(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
               << "pos: " << i << " / " << bits_to_test
               << " bit_method: " << bit_method
               << " method: " << method;
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -632,7 +632,7 @@
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 #endif
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
 
 #define HAS_SCALEROWDOWN2_SSE2
 // Reads 32 pixels, throws half away and writes 16 pixels.
--- a/tools/cpplint.py
+++ b/tools/cpplint.py
@@ -53,12 +53,8 @@
 #  - Check for 0 in char context (should be '\0')
 #  - Check for camel-case method name conventions for methods
 #    that are not simple inline getters and setters
-#  - Check that base classes have virtual destructors
-#    put "  // namespace" after } that closes a namespace, with
-#    namespace's name after 'namespace' if it is named.
 #  - Do not indent namespace contents
 #  - Avoid inlining non-trivial constructors in header files
-#    include base/basictypes.h if DISALLOW_EVIL_CONSTRUCTORS is used
 #  - Check for old-school (void) cast for call-sites of functions
 #    ignored return value
 #  - Check gUnit usage of anonymous namespace
@@ -80,6 +76,7 @@
 """
 
 import codecs
+import copy
 import getopt
 import math  # for log
 import os
@@ -139,6 +136,22 @@
       the top-level categories like 'build' and 'whitespace' will
       also be printed. If 'detailed' is provided, then a count
       is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -161,6 +174,7 @@
   'build/printf_format',
   'build/storage_class',
   'legal/copyright',
+  'readability/alt_tokens',
   'readability/braces',
   'readability/casting',
   'readability/check',
@@ -169,6 +183,7 @@
   'readability/function',
   'readability/multiline_comment',
   'readability/multiline_string',
+  'readability/namespace',
   'readability/nolint',
   'readability/streams',
   'readability/todo',
@@ -189,13 +204,14 @@
   'runtime/sizeof',
   'runtime/string',
   'runtime/threadsafe_fn',
-  'runtime/virtual',
   'whitespace/blank_line',
   'whitespace/braces',
   'whitespace/comma',
   'whitespace/comments',
+  'whitespace/empty_loop_body',
   'whitespace/end_of_line',
   'whitespace/ending_newline',
+  'whitespace/forcolon',
   'whitespace/indent',
   'whitespace/labels',
   'whitespace/line_length',
@@ -278,7 +294,35 @@
   _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
+# Alternative tokens and their replacements.  For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+    'and': '&&',
+    'bitor': '|',
+    'or': '||',
+    'xor': '^',
+    'compl': '~',
+    'bitand': '&',
+    'and_eq': '&=',
+    'or_eq': '|=',
+    'xor_eq': '^=',
+    'not': '!',
+    'not_eq': '!='
+    }
 
+# Compile regular expression that matches all the above keywords.  The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments (http://go/nsiut )
+# and multi-line strings (http://go/beujw ), but those have always been
+# troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
 # These constants define types of headers for use with
 # _IncludeState.CheckNextIncludeOrder().
 _C_SYS_HEADER = 1
@@ -287,7 +331,18 @@
 _POSSIBLE_MY_HEADER = 4
 _OTHER_HEADER = 5
 
+# These constants define the current inline assembly state
+_NO_ASM = 0       # Outside of inline assembly block
+_INSIDE_ASM = 1   # Inside inline assembly block
+_END_ASM = 2      # Last line of inline assembly block
+_BLOCK_ASM = 3    # The whole block is an inline assembly block
 
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+                        r'(?:\s+(volatile|__volatile__))?'
+                        r'\s*[{(]')
+
+
 _regexp_compile_cache = {}
 
 # Finds occurrences of NOLINT or NOLINT(...).
@@ -297,6 +352,10 @@
 # on which those errors are expected and should be suppressed.
 _error_suppressions = {}
 
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
   """Updates the global list of error-suppressions.
 
@@ -925,7 +984,7 @@
 
   1) elided member contains lines without strings and comments,
   2) lines member contains lines without comments, and
-  3) raw member contains all the lines without processing.
+  3) raw_lines member contains all the lines without processing.
   All these three members are of <type 'list'>, and of the same length.
   """
 
@@ -965,6 +1024,29 @@
     return elided
 
 
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    Index just after endchar.
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return i + 1
+  return -1
+
+
 def CloseExpression(clean_lines, linenum, pos):
   """If input points to ( or { or [, finds the position that closes it.
 
@@ -991,18 +1073,23 @@
   if startchar == '[': endchar = ']'
   if startchar == '{': endchar = '}'
 
-  num_open = line.count(startchar) - line.count(endchar)
-  while linenum < clean_lines.NumLines() and num_open > 0:
+  # Check first line
+  end_pos = FindEndOfExpressionInLine(line, pos, 0, startchar, endchar)
+  if end_pos > -1:
+    return (line, linenum, end_pos)
+  tail = line[pos:]
+  num_open = tail.count(startchar) - tail.count(endchar)
+  while linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    num_open += line.count(startchar) - line.count(endchar)
-  # OK, now find the endchar that actually got us back to even
-  endpos = len(line)
-  while num_open >= 0:
-    endpos = line.rfind(')', 0, endpos)
-    num_open -= 1                 # chopped off another )
-  return (line, linenum, endpos + 1)
+    delta = line.count(startchar) - line.count(endchar)
+    if num_open + delta <= 0:
+      return (line, linenum,
+              FindEndOfExpressionInLine(line, 0, num_open, startchar, endchar))
+    num_open += delta
 
+  # Did not find endchar before end of file, give up
+  return (line, clean_lines.NumLines(), -1)
 
 def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
@@ -1032,9 +1119,13 @@
   # Restores original filename in case that cpplint is invoked from Emacs's
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
 
   fileinfo = FileInfo(filename)
-  return re.sub(r'[-./\s]', '_', fileinfo.RepositoryName()).upper() + '_'
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
 
 
 def CheckForHeaderGuard(filename, lines, error):
@@ -1259,17 +1350,55 @@
           'Changing pointer instead of value (or unused value of operator*).')
 
 
-class _ClassInfo(object):
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified.  For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
-  def __init__(self, name, clean_lines, linenum):
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
     self.name = name
-    self.linenum = linenum
-    self.seen_open_brace = False
+    self.starting_linenum = linenum
     self.is_derived = False
-    self.virtual_method_linenumber = None
-    self.has_virtual_destructor = False
-    self.brace_depth = 0
+    if class_or_struct == 'struct':
+      self.access = 'public'
+    else:
+      self.access = 'private'
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1279,26 +1408,324 @@
     self.last_line = 0
     depth = 0
     for i in range(linenum, clean_lines.NumLines()):
-      line = clean_lines.lines[i]
+      line = clean_lines.elided[i]
       depth += line.count('{') - line.count('}')
       if not depth:
         self.last_line = i
         break
 
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
 
-class _ClassState(object):
-  """Holds the current state of the parse relating to class declarations.
 
-  It maintains a stack of _ClassInfos representing the parser's guess
-  as to the current nesting of class declarations. The innermost class
-  is at the top (back) of the stack. Typically, the stack will either
-  be empty or have exactly one entry.
-  """
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
 
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines is enclosed in this namespace.  Don't issue
+    # warning for missing namespace comments if there aren't enough
+    # lines.  However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations).  There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminate namespaces inside
+    # preprocessor macros can be cpplint clean.  Example: http://go/nxpiz
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when existing comment is a substring of the
+    # expected namespace.  Example: http://go/ldkdc, http://cl/23548205
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
   def __init__(self):
-    self.classinfo_stack = []
+    # Stack for tracking all braces.  An object is pushed whenever we
+    # see a "{", and popped when we see a "}".  Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
 
-  def CheckFinished(self, filename, error):
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+    """
+    return (not self.stack) or self.stack[-1].seen_open_brace
+
+  def InNamespaceBody(self):
+    """Check if we are currently one level inside a namespace body.
+
+    Returns:
+      True if top of the stack is a namespace block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+  def UpdatePreprocessor(self, line):
+    """Update preprocessor stack.
+
+    We need to handle preprocessors due to classes like this:
+      #ifdef SWIG
+      struct ResultDetailsPageElementExtensionPoint {
+      #else
+      struct ResultDetailsPageElementExtensionPoint : public Extension {
+      #endif
+    (see http://go/qwddn for original example)
+
+    We make the following assumptions (good enough for most files):
+    - Preprocessor condition evaluates to true from #if up to first
+      #else/#elif/#endif.
+
+    - Preprocessor condition evaluates to false from #else/#elif up
+      to #endif.  We still perform lint checks on these lines, but
+      these do not affect nesting stack.
+
+    Args:
+      line: current line to check.
+    """
+    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+      # Beginning of #if block, save the nesting stack here.  The saved
+      # stack will allow us to restore the parsing state in the #else case.
+      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+    elif Match(r'^\s*#\s*(else|elif)\b', line):
+      # Beginning of #else block
+      if self.pp_stack:
+        if not self.pp_stack[-1].seen_else:
+          # This is the first #else or #elif block.  Remember the
+          # whole nesting stack up to this point.  This is what we
+          # keep after the #endif.
+          self.pp_stack[-1].seen_else = True
+          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+        # Restore the stack to how it was before the #if
+        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+      else:
+        # TODO(unknown): unexpected #else, issue warning?
+        pass
+    elif Match(r'^\s*#\s*endif\b', line):
+      # End of #if or #else blocks.
+      if self.pp_stack:
+        # If we saw an #else, we will need to restore the nesting
+        # stack to its former state before the #else, otherwise we
+        # will just continue from where we left off.
+        if self.pp_stack[-1].seen_else:
+          # Here we can just use a shallow copy since we are the last
+          # reference to it.
+          self.stack = self.pp_stack[-1].stack_before_else
+        # Drop the corresponding #if
+        self.pp_stack.pop()
+      else:
+        # TODO(unknown): unexpected #endif, issue warning?
+        pass
+
+  def Update(self, filename, clean_lines, linenum, error):
+    """Update nesting state with current line.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    line = clean_lines.elided[linenum]
+
+    # Update pp_stack first
+    self.UpdatePreprocessor(line)
+
+    # Count parentheses.  This is to avoid adding struct arguments to
+    # the nesting stack.
+    if self.stack:
+      inner_block = self.stack[-1]
+      depth_change = line.count('(') - line.count(')')
+      inner_block.open_parentheses += depth_change
+
+      # Also check if we are starting or ending an inline assembly block.
+      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+        if (depth_change != 0 and
+            inner_block.open_parentheses == 1 and
+            _MATCH_ASM.match(line)):
+          # Enter assembly block
+          inner_block.inline_asm = _INSIDE_ASM
+        else:
+          # Not entering assembly block.  If previous line was _END_ASM,
+          # we will now shift to _NO_ASM state.
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line.  Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace.  The "\b\s*" below catches namespace
+      # declarations even if it weren't followed by a whitespace, this
+      # is so that we don't confuse our namespace checker.  The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces.  The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T
+    #             class Comparator = less<T>,
+    #             class Vector = vector<T> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename, typename> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        '(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        '(([^=>]|<[^<>]*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      access_match = Match(r'\s*(public|private|protected)\s*:', line)
+      if access_match:
+        self.stack[-1].access = access_match.group(1)
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+      # If namespace or class hasn't seen an opening brace yet, mark
+        # namespace/class head as complete.  Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration.  Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckClassFinished(self, filename, error):
     """Checks that all classes have been completely parsed.
 
     Call this when all lines in a file have been processed.
@@ -1306,17 +1733,18 @@
       filename: The name of the current file.
       error: The function to call with any errors found.
     """
-    if self.classinfo_stack:
-      # Note: This test can result in false positives if #ifdef constructs
-      # get in the way of brace matching. See the testBuildClass test in
-      # cpplint_unittest.py for an example of this.
-      error(filename, self.classinfo_stack[0].linenum, 'build/class', 5,
-            'Failed to find complete declaration of class %s' %
-            self.classinfo_stack[0].name)
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
 
 
 def CheckForNonStandardConstructs(filename, clean_lines, linenum,
-                                  class_state, error):
+                                  nesting_state, error):
   """Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
 
   Complain about several constructs which gcc-2 accepts, but which are
@@ -1329,8 +1757,6 @@
   - text after #endif is not allowed.
   - invalid inner-style forward declaration.
   - >? and <? operators, and their >?= and <?= cousins.
-  - classes with virtual methods need virtual destructors (compiler warning
-    available, but not turned on yet.)
 
   Additionally, check for constructor/destructor style violations and reference
   members, as it is very convenient to do so while checking for
@@ -1340,8 +1766,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
   """
@@ -1370,7 +1796,7 @@
   if Search(r'\b(const|volatile|void|char|short|int|long'
             r'|float|double|signed|unsigned'
             r'|schar|u?int8|u?int16|u?int32|u?int64)'
-            r'\s+(auto|register|static|extern|typedef)\b',
+            r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
           'Storage class (static, extern, typedef, etc) should be first.')
@@ -1400,45 +1826,13 @@
           'const string& members are dangerous. It is much better to use '
           'alternatives, such as pointers or simple constants.')
 
-  # Track class entry and exit, and attempt to find cases within the
-  # class declaration that don't meet the C++ style
-  # guidelines. Tracking is very dependent on the code matching Google
-  # style guidelines, but it seems to perform well enough in testing
-  # to be a worthwhile addition to the checks.
-  classinfo_stack = class_state.classinfo_stack
-  # Look for a class declaration. The regexp accounts for decorated classes
-  # such as in:
-  # class LOCKABLE API Object {
-  # };
-  class_decl_match = Match(
-      r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-      '(class|struct)\s+([A-Z_]+\s+)*(\w+(::\w+)*)', line)
-  if class_decl_match:
-    classinfo_stack.append(_ClassInfo(
-        class_decl_match.group(4), clean_lines, linenum))
-
-  # Everything else in this function uses the top of the stack if it's
-  # not empty.
-  if not classinfo_stack:
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
+  classinfo = nesting_state.InnermostClass()
+  if not classinfo or not classinfo.seen_open_brace:
     return
 
-  classinfo = classinfo_stack[-1]
-
-  # If the opening brace hasn't been seen look for it and also
-  # parent class declarations.
-  if not classinfo.seen_open_brace:
-    # If the line has a ';' in it, assume it's a forward declaration or
-    # a single-line class declaration, which we won't process.
-    if line.find(';') != -1:
-      classinfo_stack.pop()
-      return
-    classinfo.seen_open_brace = (line.find('{') != -1)
-    # Look for a bare ':'
-    if Search('(^|[^:]):($|[^:])', line):
-      classinfo.is_derived = True
-    if not classinfo.seen_open_brace:
-      return  # Everything else in this function is for after open brace
-
   # The class may have been declared with namespace or classname qualifiers.
   # The constructor and destructor will not have those qualifiers.
   base_classname = classinfo.name.split('::')[-1]
@@ -1455,36 +1849,7 @@
     error(filename, linenum, 'runtime/explicit', 5,
           'Single-argument constructors should be marked explicit.')
 
-  # Look for methods declared virtual.
-  if Search(r'\bvirtual\b', line):
-    classinfo.virtual_method_linenumber = linenum
-    # Only look for a destructor declaration on the same line. It would
-    # be extremely unlikely for the destructor declaration to occupy
-    # more than one line.
-    if Search(r'~%s\s*\(' % base_classname, line):
-      classinfo.has_virtual_destructor = True
 
-  # Look for class end.
-  brace_depth = classinfo.brace_depth
-  brace_depth = brace_depth + line.count('{') - line.count('}')
-  if brace_depth <= 0:
-    classinfo = classinfo_stack.pop()
-    # Try to detect missing virtual destructor declarations.
-    # For now, only warn if a non-derived class with virtual methods lacks
-    # a virtual destructor. This is to make it less likely that people will
-    # declare derived virtual destructors without declaring the base
-    # destructor virtual.
-    if ((classinfo.virtual_method_linenumber is not None) and
-        (not classinfo.has_virtual_destructor) and
-        (not classinfo.is_derived)):  # Only warn for base classes
-      error(filename, classinfo.linenum, 'runtime/virtual', 4,
-            'The class %s probably needs a virtual destructor due to '
-            'having virtual method(s), one declared at line %d.'
-            % (classinfo.name, classinfo.virtual_method_linenumber))
-  else:
-    classinfo.brace_depth = brace_depth
-
-
 def CheckSpacingForFunctionCall(filename, line, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
@@ -1535,7 +1900,8 @@
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall)):
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)?\*\w+\)\(', fncall)):
       error(filename, linenum, 'whitespace/parens', 4,
             'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
@@ -1668,8 +2034,165 @@
       error(filename, linenum, 'whitespace/todo', 2,
             'TODO(my_username) should be followed by a space')
 
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for improper use of DISALLOW* macros.
 
-def CheckSpacing(filename, clean_lines, linenum, error):
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                   r'DISALLOW_EVIL_CONSTRUCTORS|'
+                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+  if not matched:
+    return
+  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+    if nesting_state.stack[-1].access != 'private':
+      error(filename, linenum, 'readability/constructors', 3,
+            '%s must be in the private: section' % matched.group(1))
+
+  else:
+    # Found DISALLOW* macro outside a class declaration, or perhaps it
+    # was used inside a function when it should have been part of the
+    # class declaration.  We could issue a warning here, but it
+    # probably resulted in a compiler error already.
+    pass
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments (e.g. http://go/prccd) and
+    # other template expressions (e.g. http://go/oxcjq).
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   """Checks for the correctness of various spacing issues in the code.
 
   Things we check for: spaces around operators, spaces after
@@ -1682,6 +2205,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -1691,7 +2216,16 @@
   # Before nixing comments, check if the line is blank for no good
   # reason.  This includes the first line after a block is opened, and
   # blank lines at the end of a function (ie, right before a line like '}'
-  if IsBlankLine(line):
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -1699,8 +2233,7 @@
     #                both start with alnums and are indented the same amount.
     #                This ignores whitespace at the start of a namespace block
     #                because those are not usually indented.
-    if (prevbrace != -1 and prev_line[prevbrace:].find('}') == -1
-        and prev_line[:prevbrace].find('namespace') == -1):
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
       # OK, we have a blank line at the start of a code block.  Before we
       # complain, we check if it is an exception to the rule: The previous
       # non-empty line has the parameters of a function header that are indented
@@ -1732,12 +2265,7 @@
       if not exception:
         error(filename, linenum, 'whitespace/blank_line', 2,
               'Blank line at the start of a code block.  Is this needed?')
-    # This doesn't ignore whitespace at the end of a namespace block
-    # because that is too hard without pairing open/close braces;
-    # however, a special exception is made for namespace closing
-    # brackets which have a comment containing "namespace".
-    #
-    # Also, ignore blank lines at the end of a block in a long if-else
+    # Ignore blank lines at the end of a block in a long if-else
     # chain, like this:
     #   if (condition1) {
     #     // Something followed by a blank line
@@ -1749,7 +2277,6 @@
       next_line = raw[linenum + 1]
       if (next_line
           and Match(r'\s*}', next_line)
-          and next_line.find('namespace') == -1
           and next_line.find('} else ') == -1):
         error(filename, linenum, 'whitespace/blank_line', 3,
               'Blank line at the end of a code block.  Is this needed?')
@@ -1810,26 +2337,59 @@
   # though, so we punt on this one for now.  TODO.
 
   # You should always have whitespace around binary operators.
-  # Alas, we can't test < or > because they're legitimately used sans spaces
-  # (a->b, vector<int> a).  The only time we can tell is a < with no >, and
-  # only if it's not template params list spilling into the next line.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
   match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
-  if not match:
-    # Note that while it seems that the '<[^<]*' term in the following
-    # regexp could be simplified to '<.*', which would indeed match
-    # the same class of strings, the [^<] means that searching for the
-    # regexp takes linear rather than quadratic time.
-    if not Search(r'<[^<]*,\s*$', line):  # template params spill
-      match = Search(r'[^<>=!\s](<)[^<>=!\s]([^>]|->)*$', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << and >> when used like this: 10<<20, but
+  # We allow no-spaces around << when used like this: 10<<20, but
   # not otherwise (particularly, not when used as streams)
-  match = Search(r'[^0-9\s](<<|>>)[^0-9\s]', line)
+  match = Search(r'(\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if match and not (match.group(1).isdigit() and match.group(2).isdigit()):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around %s' % match.group(1))
+          'Missing spaces around >>')
 
   # There shouldn't be space around unary operators
   match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
@@ -1903,18 +2463,25 @@
   # the semicolon there.
   if Search(r':\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
-          'Semicolon defining empty statement. Use { } instead.')
+          'Semicolon defining empty statement. Use {} instead.')
   elif Search(r'^\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Line contains only semicolon. If this should be an empty statement, '
-          'use { } instead.')
+          'use {} instead.')
   elif (Search(r'\s+;\s*$', line) and
         not Search(r'\bfor\b', line)):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Extra space before last semicolon. If this should be an empty '
-          'statement, use { } instead.')
+          'statement, use {} instead.')
 
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search('for *\(.*[^:]:[^: ]', line) or
+      Search('for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
 
+
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
   """Checks for additional blank line issues related to sections.
 
@@ -1938,8 +2505,8 @@
   #
   # If we didn't find the end of the class, last_line would be zero,
   # and the check will be skipped by the first condition.
-  if (class_info.last_line - class_info.linenum <= 24 or
-      linenum <= class_info.linenum):
+  if (class_info.last_line - class_info.starting_linenum <= 24 or
+      linenum <= class_info.starting_linenum):
     return
 
   matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
@@ -1950,15 +2517,18 @@
     #  - We are at the beginning of the class.
     #  - We are forward-declaring an inner class that is semantically
     #    private, but needed to be public for implementation reasons.
+    # Also ignores cases where the previous line ends with a backslash as can be
+    # common when defining classes in C macros.
     prev_line = clean_lines.lines[linenum - 1]
     if (not IsBlankLine(prev_line) and
-        not Search(r'\b(class|struct)\b', prev_line)):
+        not Search(r'\b(class|struct)\b', prev_line) and
+        not Search(r'\\$', prev_line)):
       # Try a bit harder to find the beginning of the class.  This is to
       # account for multi-line base-specifier lists, e.g.:
       #   class Derived
       #       : public Base {
-      end_class_head = class_info.linenum
-      for i in range(class_info.linenum, linenum):
+      end_class_head = class_info.starting_linenum
+      for i in range(class_info.starting_linenum, linenum):
         if Search(r'\{\s*$', clean_lines.lines[i]):
           end_class_head = i
           break
@@ -2008,9 +2578,11 @@
     # which is commonly used to control the lifetime of
     # stack-allocated variables.  We don't detect this perfectly: we
     # just don't complain if the last non-whitespace character on the
-    # previous non-blank line is ';', ':', '{', or '}'.
+    # previous non-blank line is ';', ':', '{', or '}', or if the previous
+    # line starts a preprocessor block.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-    if not Search(r'[;:}{]\s*$', prevline):
+    if (not Search(r'[;:}{]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
@@ -2064,6 +2636,33 @@
           "You don't need a ; after a }")
 
 
+def CheckEmptyLoopBody(filename, clean_lines, linenum, error):
+  """Look for empty loop body with only a single semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Search for loop keywords at the beginning of the line.  Because only
+  # whitespaces are allowed before the keywords, this will also ignore most
+  # do-while-loops, since those lines should start with closing brace.
+  line = clean_lines.elided[linenum]
+  if Match(r'\s*(for|while)\s*\(', line):
+    # Find the end of the conditional expression
+    (end_line, end_linenum, end_pos) = CloseExpression(
+        clean_lines, linenum, line.find('('))
+
+    # Output warning if what follows the condition expression is a semicolon.
+    # No warning for all other cases, including whitespace or newline, since we
+    # have a separate check for semicolons preceded by whitespace.
+    if end_pos >= 0 and Match(r';', end_line[end_pos:]):
+      error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+            'Empty loop bodies should use {} or continue')
+
+
 def ReplaceableCheck(operator, macro, line):
   """Determine whether a basic CHECK can be replaced with a more specific one.
 
@@ -2132,6 +2731,38 @@
       break
 
 
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Check alternative keywords being used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Avoid preprocessor lines
+  if Match(r'^\s*#', line):
+    return
+
+  # Last ditch effort to avoid multi-line comments.  This will not help
+  # if the comment started before the current line or ended after the
+  # current line, but it catches most of the false positives.  At least,
+  # it provides a way to workaround this warning for people who use
+  # multi-line comments in preprocessor macros.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if line.find('/*') >= 0 or line.find('*/') >= 0:
+    return
+
+  for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
+
+
 def GetLineWidth(line):
   """Determines the width of the line in column positions.
 
@@ -2154,7 +2785,7 @@
     return len(line)
 
 
-def CheckStyle(filename, clean_lines, linenum, file_extension, class_state,
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
                error):
   """Checks rules from the 'C++ style rules' section of cppguide.html.
 
@@ -2167,6 +2798,8 @@
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -2248,16 +2881,19 @@
       not ((cleansed_line.find('case ') != -1 or
             cleansed_line.find('default:') != -1) and
            cleansed_line.find('break;') != -1)):
-    error(filename, linenum, 'whitespace/newline', 4,
+    error(filename, linenum, 'whitespace/newline', 0,
           'More than one command on the same line')
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
-  CheckSpacing(filename, clean_lines, linenum, error)
+  CheckEmptyLoopBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
   CheckCheck(filename, clean_lines, linenum, error)
-  if class_state and class_state.classinfo_stack:
-    CheckSectionSpacing(filename, clean_lines,
-                        class_state.classinfo_stack[-1], linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
 _RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
@@ -2554,9 +3190,11 @@
                      fnline))):
 
     # We allow non-const references in a few standard places, like functions
-    # called "swap()" or iostream operators like "<<" or ">>".
+    # called "swap()" or iostream operators like "<<" or ">>". We also filter
+    # out for loops, which lint otherwise mistakenly thinks are functions.
     if not Search(
-        r'(swap|Swap|operator[<>][<>])\s*\(\s*(?:[\w:]|<.*>)+\s*&',
+        r'(for|swap|Swap|operator[<>][<>])\s*\(\s*'
+        r'(?:(?:typename\s*)?[\w:]|<.*>)+\s*&',
         fnline):
       error(filename, linenum, 'runtime/references', 2,
             'Is this a non-const reference? '
@@ -2578,10 +3216,19 @@
     if (match.group(1) is None and  # If new operator, then this isn't a cast
         not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
              Match(r'^\s*MockCallback<.*>', line))):
-      error(filename, linenum, 'readability/casting', 4,
-            'Using deprecated casting style.  '
-            'Use static_cast<%s>(...) instead' %
-            match.group(2))
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines (for example http://go/hrfhr ), so we only need
+      # to check the previous line for MOCK_METHOD.
+      if (linenum == 0 or
+          not Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(\S+,\s*$',
+                    clean_lines.elided[linenum - 1])):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              match.group(2))
 
   CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
                   'static_cast',
@@ -2703,7 +3350,7 @@
   printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
   if printf_args:
     match = Match(r'([\w.\->()]+)$', printf_args)
-    if match:
+    if match and match.group(1) != '__VA_ARGS__':
       function_name = re.search(r'\b((?:string)?printf)\s*\(',
                                 line, re.I).group(1)
       error(filename, linenum, 'runtime/printf', 4,
@@ -2824,6 +3471,11 @@
           'Using sizeof(type).  Use sizeof(varname) instead if possible')
     return True
 
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
   remainder = line[match.end(0):]
 
   # The close paren is for function pointers as arguments to a function.
@@ -3112,13 +3764,13 @@
   if match:
     error(filename, linenum, 'build/explicit_make_pair',
           4,  # 4 = high confidence
-          'Omit template arguments from make_pair OR use pair directly OR'
-          ' if appropriate, construct a pair directly')
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
-def ProcessLine(filename, file_extension,
-                clean_lines, line, include_state, function_state,
-                class_state, error, extra_check_functions=[]):
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
   """Processes a single line in the file.
 
   Args:
@@ -3129,8 +3781,8 @@
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
     extra_check_functions: An array of additional check functions that will be
@@ -3139,13 +3791,16 @@
   """
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
-  CheckStyle(filename, clean_lines, line, file_extension, class_state, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
   CheckLanguage(filename, clean_lines, line, file_extension, include_state,
                 error)
   CheckForNonStandardConstructs(filename, clean_lines, line,
-                                class_state, error)
+                                nesting_state, error)
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
@@ -3172,7 +3827,7 @@
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  class_state = _ClassState()
+  nesting_state = _NestingState()
 
   ResetNolintSuppressions()
 
@@ -3185,9 +3840,9 @@
   clean_lines = CleansedLines(lines)
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
-                include_state, function_state, class_state, error,
+                include_state, function_state, nesting_state, error,
                 extra_check_functions)
-  class_state.CheckFinished(filename, error)
+  nesting_state.CheckClassFinished(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
@@ -3301,7 +3956,8 @@
   try:
     (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
                                                  'counting=',
-                                                 'filter='])
+                                                 'filter=',
+                                                 'root='])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
@@ -3327,6 +3983,9 @@
       if val not in ('total', 'toplevel', 'detailed'):
         PrintUsage('Valid counting options are total, toplevel, and detailed')
       counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
 
   if not filenames:
     PrintUsage('No files were specified.')
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -97,7 +97,7 @@
     vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
     vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
 
-    vmvn.s16        d4, d4
+    vmvn            d4, d4
     vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
     vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
     vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
@@ -200,7 +200,7 @@
     vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
     vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
 
-    vmvn.s16        q14, q14
+    vmvn            q14, q14
 
     vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
     vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2755,7 +2755,7 @@
     /* Clear the alternate reference update pending flag. */
     cpi->source_alt_ref_pending = 0;
 
-    /* Set the alternate refernce frame active flag */
+    /* Set the alternate reference frame active flag */
     cpi->source_alt_ref_active = 1;
 
 
@@ -3402,7 +3402,7 @@
     else
         cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
 
-    /* Check to see if a key frame is signalled
+    /* Check to see if a key frame is signaled
      * For two pass with auto key frame enabled cm->frame_type may already
      * be set, but not for one pass.
      */
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -91,18 +91,8 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
-# TODO(johann) make this generic
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
-endif
-
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/denoising_sse2.c.d: CFLAGS += -msse2
-endif
 endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
--- a/vp9/common/ppc/vp9_copy_altivec.asm
+++ /dev/null
@@ -1,47 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/common/ppc/vp9_filter_altivec.asm
+++ /dev/null
@@ -1,1013 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl sixtap_predict_ppc
-    .globl sixtap_predict8x4_ppc
-    .globl sixtap_predict8x8_ppc
-    .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
-    load_c \V0, HFilter, r5, r9, r10
-
-    addi    r5,  r5, 16
-    lvx     \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
-    load_c v0, VFilter, r6, r3, r10
-
-    vspltish v5, 8
-    vspltish v6, 3
-    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v1, v0, 1
-    vspltb  v2, v0, 2
-    vspltb  v3, v0, 3
-    vspltb  v4, v0, 4
-    vspltb  v5, v0, 5
-    vspltb  v0, v0, 0
-.endm
-
-.macro vpre_load
-    Vprolog
-    li      r10,  16
-    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
-    lvx     v11, r10, r9
-    addi    r9,   r9, 32
-    lvx     v12,   0, r9
-    lvx     v13, r10, r9
-    addi    r9,   r9, 32
-    lvx     v14,   0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
-                                ;# (Re,Ro) += (V*T)
-    vmuleub \TMP, \V, \T        ;# trashes v8
-    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
-    vmuloub \TMP, \V, \T
-    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
-    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
-    vadduhm v16, v6, v8
-    vmuloub  v8, \P0, v0
-    vadduhm v17, v6, v8
-    Msum v16, v17, \P2, v2, v8
-    Msum v16, v17, \P3, v3, v8
-    Msum v16, v17, \P5, v5, v8
-
-    vmuleub v18, \P1, v1        ;# 2 negative taps
-    vmuloub v19, \P1, v1
-    Msum v18, v19, \P4, v4, v8
-
-    vsubuhs v16, v16, v18       ;# subtract neg from pos
-    vsubuhs v17, v17, v19
-    vsrh    v16, v16, v7        ;# divide by 128
-    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
-    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
-    vmrglh  v19, v16, v17
-    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
-    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
-    vadduhm v21, v20, v24
-    vmuloub v24, \P0, v13
-    vadduhm v22, v20, v24
-    Msum v21, v22, \P2, v15, v25
-    Msum v21, v22, \P3, v16, v25
-    Msum v21, v22, \P5, v18, v25
-
-    vmuleub v23, \P1, v14       ;# 2 negative taps
-    vmuloub v24, \P1, v14
-    Msum v23, v24, \P4, v17, v25
-
-    vsubuhs v21, v21, v23       ;# subtract neg from pos
-    vsubuhs v22, v22, v24
-    vsrh    v21, v21, v19       ;# divide by 128
-    vsrh    v22, v22, v19       ;# v16 v17 = evens, odds
-    vmrghh  v23, v21, v22       ;# v18 v19 = 16-bit result in order
-    vmrglh  v24, v21, v22
-    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
-    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
-    stvx    \P0, 0, r7
-    add     r7, r7, r8      ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
-    addi    r9,   r9, 16        ;# P5 = newest input row
-    lvx     \P5,   0, r9
-    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
-    luma_v v10, v11, v12, v13, v14, v15
-    luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
-    luma_vtwo
-    luma_v v12, v13, v14, v15, v10, v11
-    luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
-    luma_vfour
-    luma_v v14, v15, v10, v11, v12, v13
-    luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
-    vmsummbm \R, v13, \I, v15
-    vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     \VD,   0, \RS
-    lvx     v20, r10, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
-    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
-    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
-    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
-    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
-    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
-
-    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
-    vsrh    \R, \R, v19
-
-    vpkuhus \R, \R, \R          ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v20,   0, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, v20, v20, v21
-.endm
-    .text
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xff87
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    vertical_only_4x4
-
-    ;# load up horizontal filter
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_4x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_4x4
-
-vertical_only_4x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0, r3, r4, 1
-    Read8x8 v1, r3, r4, 1
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_4x4:
-    load_c   v20, b_hilo_4x4, 0, r9, r10
-    load_c   v21, b_hilo, 0, r9, r10
-
-    ;# reposition input so that it can go through the
-    ;# filtering phase with one pass.
-    vperm   v0, v0, v1, v20     ;# 0 1 x x
-    vperm   v2, v2, v3, v20     ;# 2 3 x x
-    vperm   v4, v4, v5, v20     ;# 4 5 x x
-    vperm   v6, v6, v7, v20     ;# 6 7 x x
-
-    vperm   v0, v0, v2, v21     ;# 0 1 2 3
-    vperm   v4, v4, v6, v21     ;# 4 5 6 7
-
-    vsldoi  v1, v0, v4, 4
-    vsldoi  v2, v0, v4, 8
-    vsldoi  v3, v0, v4, 12
-
-    vsldoi  v5, v4, v8, 4
-
-    load_c   v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
-    stvx    v0, 0, r1
-
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 4(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 8(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 12(r1)
-    stw     r0, 0(r7)
-
-    b       exit_4x4
-
-store_4x4:
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v4, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v5, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x4
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_8x4
-
-second_pass_pre_copy_8x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x4:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-    b       exit_8x4
-
-store_8x4:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x4
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned2_8x4:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;#  register there is no need to loop.  Everything can stay in registers.
-sixtap_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x8
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 1
-    Read8x8 v9, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-    interp_8x8 v9
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x8
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0,  r9, r4, 1
-    Read8x8 v1,  r9, r4, 0
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v10
-    interp_8x8 v11
-    interp_8x8 v12
-
-    b       second_pass_8x8
-
-second_pass_pre_copy_8x8:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-    Read8x8 v9,  r3, r4, 1
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x8:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
-    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
-    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
-    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-    b       exit_8x8
-
-store_8x8:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x8
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-    w_8x8   v8, r7, r0, r8
-    w_8x8   v9, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned2_8x8:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-    vperm   v8, v8, v9, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-    addi    r7, r7, 16
-    stvx    v8, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
-;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
-;#  temporary buffer because the source buffer can't be modified and the buffer
-;#  for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    ;# Three possiblities
-    ;#  1. First filter is null.  Don't use a temp buffer.
-    ;#  2. Second filter is null.  Don't use a temp buffer.
-    ;#  3. Neither are null, use temp buffer.
-
-    ;# First Pass (horizontal edge)
-    ;#  setup pointers for src
-    ;#  if possiblity (1) then setup the src pointer to be the orginal and jump
-    ;#  to second pass.  this is based on if x_offset is 0.
-
-    ;# load up horizontal filter
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    load_hfilter v4, v5
-
-    beq-    copy_horizontal_16x21
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements are guessing that there won't be a second pass,
-    ;#  but if there is then inside the bypass they need to be set
-    li      r0, 16              ;# prepare for no vertical filter
-
-    ;# Change the output pointer and pitch to be the actual
-    ;#  desination instead of a temporary buffer.
-    addi    r9, r7, 0
-    addi    r5, r8, 0
-
-    ;# no vertical filter, so write the output from the first pass
-    ;#  directly into the output buffer.
-    beq-    no_vertical_filter_bypass
-
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# setup counter for the number of lines that are going to be filtered
-    li      r0, 21
-
-    ;# use the stack as temporary storage
-    la      r9, 48(r1)
-    li      r5, 16
-
-no_vertical_filter_bypass:
-
-    mtctr   r0
-
-    ;# rounding added in on the multiply
-    vspltisw v10, 8
-    vspltisw v12, 3
-    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v13, 7
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-    lvx     v3, r12, r3
-
-    vperm   v8, v1, v2, v15
-    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
-
-    vsldoi  v11, v8, v9, 4
-
-    ;# set 0
-    vmsummbm v6, v4, v8, v12    ;# taps times elements
-    vmsummbm v0, v5, v11, v6
-
-    ;# set 1
-    vsldoi  v10, v8, v9, 1
-    vsldoi  v11, v8, v9, 5
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v1, v5, v11, v6
-
-    ;# set 2
-    vsldoi  v10, v8, v9, 2
-    vsldoi  v11, v8, v9, 6
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v2, v5, v11, v6
-
-    ;# set 3
-    vsldoi  v10, v8, v9, 3
-    vsldoi  v11, v8, v9, 7
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v3, v5, v11, v6
-
-    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
-
-    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
-    vsrh    v1, v1, v13
-
-    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
-    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
-
-    stvx    v0,  0, r9
-    add     r9, r9, r5
-
-    add     r3, r3, r4
-
-    bdnz    horizontal_loop_16x16
-
-    ;# check again to see if vertical filter needs to be done.
-    cmpi    cr0, r6, 0
-    beq     cr0, end_16x16
-
-    ;# yes there is, so go to the second pass
-    b       second_pass_16x16
-
-copy_horizontal_16x21:
-    li      r10, 21
-    mtctr   r10
-
-    li      r10, 16
-
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# this is done above if there is a horizontal filter,
-    ;#  if not it needs to be done down here.
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-    ;# always write to the stack when doing a horizontal copy
-    la      r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v8, v1, v2, v15
-
-    stvx    v8,  0, r9
-    addi    r9, r9, 16
-
-    add     r3, r3, r4
-
-    bdnz    copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
-    ;# always read from the stack when doing a vertical filter
-    la      r9, 48(r1)
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v7, 7
-
-    vpre_load
-
-    luma_vsix
-    luma_vsix
-    luma_vfour
-
-end_16x16:
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-HFilter:
-    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
-    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
-    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
-    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
-    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
-    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
-    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
-    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
-    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
-    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
-    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-
-    .align 4
-VFilter:
-    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-
-    .align 4
-b_hperm:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-B_0123:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-B_4567:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-B_89AB:
-    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
-    .align 4
-b_hilo:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-b_hilo_4x4:
-    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
--- a/vp9/common/ppc/vp9_filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v20 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v21 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
-    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/common/ppc/vp9_idct_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp9/common/ppc/vp9_loopfilter_altivec.c
+++ /dev/null
@@ -1,127 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-  unsigned char *u,   // source pointer
-  unsigned char *v,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  // These should all be done at once with one call, instead of 3
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl mbloop_filter_horizontal_edge_y_ppc
-    .globl loop_filter_horizontal_edge_y_ppc
-    .globl mbloop_filter_vertical_edge_y_ppc
-    .globl loop_filter_vertical_edge_y_ppc
-
-    .globl mbloop_filter_horizontal_edge_uv_ppc
-    .globl loop_filter_horizontal_edge_uv_ppc
-    .globl mbloop_filter_vertical_edge_uv_ppc
-    .globl loop_filter_vertical_edge_uv_ppc
-
-    .globl loop_filter_simple_horizontal_edge_ppc
-    .globl loop_filter_simple_vertical_edge_ppc
-
-    .text
-;# We often need to perform transposes (and other transpose-like operations)
-;#   on matrices of data.  This is simplified by the fact that we usually
-;#   operate on hunks of data whose dimensions are powers of 2, or at least
-;#   divisible by highish powers of 2.
-;#
-;#   These operations can be very confusing.  They become more straightforward
-;#   when we think of them as permutations of address bits: Concatenate a
-;#   group of vector registers and think of it as occupying a block of
-;#   memory beginning at address zero.  The low four bits 0...3 of the
-;#   address then correspond to position within a register, the higher-order
-;#   address bits select the register.
-;#
-;#   Although register selection, at the code level, is arbitrary, things
-;#   are simpler if we use contiguous ranges of register numbers, simpler
-;#   still if the low-order bits of the register number correspond to
-;#   conceptual address bits.  We do this whenever reasonable.
-;#
-;#   A 16x16 transpose can then be thought of as an operation on
-;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;#   memory and the effect of a transpose is to interchange address bit
-;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;#   column, which is interchanged with the row addressed by bits 4..7.
-;#
-;#   The altivec merge instructions provide a rapid means of effecting
-;#   many of these transforms.  They operate at three widths (8,16,32).
-;#   Writing V(x) for vector register #x, paired merges permute address
-;#   indices as follows.
-;#
-;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#      vmrghb  V( x),          V( y), V( y + (1<<s))
-;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#      vmrghh  V( x),          V( y), V( y + (1<<s))
-;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
-;#
-;#      vmrghw  V( x),          V( y), V( y + (1<<s))
-;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   Unfortunately, there is no doubleword merge instruction.
-;#   The following sequence uses "vperm" is a substitute.
-;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;#   are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
-;#
-;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
-;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;#   Except for bits s and d, the other relationships between register
-;#   number (= high-order part of address) bits are at the disposal of
-;#   the programmer.
-;#
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;#   edges together.  This requires a single 16x16 transpose, which, in
-;#   the above language, amounts to the following permutation of address
-;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
-;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;#   Except for the fact that the destination registers get written
-;#   before we are done referencing the old contents, the cyclic transform
-;#   is effected by
-;#
-;#      x = 0;  do {
-;#          vmrghb V(2x),   V(x), V(x+8);
-;#          vmrghb V(2x+1), V(x), V(x+8);
-;#      } while( ++x < 8);
-;#
-;#   For clarity, and because we can afford it, we do this transpose
-;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
-;#   leaving the final result in 16 .. 31, as the lower registers are
-;#   used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1, v16,v24
-    Tpair v2,v3, v17,v25
-    Tpair v4,v5, v18,v26
-    Tpair v6,v7, v19,v27
-    Tpair v8,v9, v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 47 48
-;#  v3 = 49 50 ... 62 63
-;#
-;#  In frame-buffer memory, the layout is:
-;#
-;#     0  16  32  48
-;#     1  17  33  49
-;#     ...
-;#    15  31  47  63.
-;#
-;#  We begin by reading the data 32 bits at a time (using scalar operations)
-;#  into a temporary array, reading the rows of the array into vector registers,
-;#  with the following layout:
-;#
-;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
-;#  v1 =  1 17 33 49  5 21 ...                      45 61
-;#  v2 =  2 18 ...                                  46 62
-;#  v3 =  3 19 ...                                  47 63
-;#
-;#  From the "address-bit" perspective discussed above, we simply need to
-;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;#  In other words, we transpose each of the four 4x4 submatrices.
-;#
-;#  This transformation is its own inverse, and we need to perform it
-;#  again before writing the pixels back into the frame buffer.
-;#
-;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;#  defined above.  We think of both groups of 4 registers as having
-;#  "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
-    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
-
-    vmrghb  v4, v0, v1
-    vmrglb  v5, v0, v1
-    vmrghb  v6, v2, v3
-    vmrglb  v7, v2, v3
-
-    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
-
-    vmrghh  v0, v4, v6
-    vmrglh  v1, v4, v6
-    vmrghh  v2, v5, v7
-    vmrglh  v3, v5, v7
-
-    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
-
-    vmrghw  v4, v0, v1
-    vmrglw  v5, v0, v1
-    vmrghw  v6, v2, v3
-    vmrglw  v7, v2, v3
-
-    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
-
-    vperm   v0, v4, v6, \Vlo
-    vperm   v1, v4, v6, \Vhi
-    vperm   v2, v5, v7, \Vlo
-    vperm   v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;#   We read 8 columns of data, initially in the following pattern:
-;#
-;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
-;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
-;#  ...
-;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;#   and wish to convert to:
-;#
-;#  (0,0) ... (0,15)
-;#  (1,0) ... (1,15)
-;#  ...
-;#  (7,0) ... (7,15).
-;#
-;#  In "address bit" language, we wish to map
-;#
-;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
-;#
-;#  This can be accomplished by 4 iterations of the cyclic transform
-;#
-;#  I -> (I+1) mod 7;
-;#
-;#  each iteration can be realized by (d=0, s=2):
-;#
-;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
-;#
-;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
-;#  preserving v8 = sign converter.
-;#
-;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;#  result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
-    Tpair v10, v11,  v0, v4
-    Tpair v12, v13,  v1, v5
-    Tpair v14, v15,  v2, v6
-    Tpair v16, v17,  v3, v7
-.endm
-
-.macro t8x16_even
-    Tpair v0, v1,  v10, v14
-    Tpair v2, v3,  v11, v15
-    Tpair v4, v5,  v12, v16
-    Tpair v6, v7,  v13, v17
-.endm
-
-.macro transpose8x16_fwd
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-    t8x16_even
-.endm
-
-.macro transpose8x16_inv
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-.endm
-
-.macro Transpose16x16
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;#   into vector register Vreg.  Trashes r0
-.macro load_g Vreg, Gptr
-    lwz     r0, \Gptr
-    lvx     \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here.  if the answer is negative
-;# it will be clamped to 0.  orring 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
-    vsububs \RES, \A, \B
-    vsububs \TMP, \B, \A
-    vor     \RES, \RES, \TMP
-.endm
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
-    vsububs \TMP, \A, \B
-    vmaxub  \RES, \RES, \TMP
-    vsububs \TMP, \B, \A
-    vmaxub  \RES, \RES, \TMP
-.endm
-
-.macro Masks
-    ;# build masks
-    ;# input is all 8 bit unsigned (0-255).  need to
-    ;# do abs(vala-valb) > limit.  but no need to compare each
-    ;# value to the limit.  find the max of the absolute differences
-    ;# and compare that to the limit.
-    ;# First hev
-    Abs     v14, v13, v2, v3    ;# |P1 - P0|
-    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
-    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
-    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
-    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
-
-    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx     \FL, 0, \RFL        ;# flimit
-    lvx     \LI, 0, \RLI        ;# limit
-    lvx     \TH, 0, \RTH        ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add     r5, r4, r4          ;# r5 = 2 * stride
-    sub     r6, r3, r5          ;# r6 -> 2 rows back
-    neg     r7, r4              ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
-    lvx     v0,  0, r0          ;# P3  (read only)
-    lvx     v1, r7, r6          ;# P2
-    lvx     v2,  0, r6          ;# P1
-    lvx     v3, r7, r3          ;# P0
-    lvx     v4,  0, r3          ;# Q0
-    lvx     v5, r4, r3          ;# Q1
-    lvx     v6, r5, r3          ;# Q2
-    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
-    lvx     v7, r4, r0          ;# Q3  (read only)
-.endm
-
-;# Expects
-;#  v10 == HEV
-;#  v13 == tmp
-;#  v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor    \P1, \P1, v11       ;# SP1
-    vxor    \P0, \P0, v11       ;# SP0
-    vxor    \Q0, \Q0, v11       ;# SQ0
-    vxor    \Q1, \Q1, v11       ;# SQ1
-
-    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand    v13, v13, v10       ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
-
-    vsrab   v13, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
-.endm
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the fitering here
-    vxor    v1, v1, v11         ;# SP2
-    vxor    v2, v2, v11         ;# SP1
-    vxor    v3, v3, v11         ;# SP0
-    vxor    v4, v4, v11         ;# SQ0
-    vxor    v5, v5, v11         ;# SQ1
-    vxor    v6, v6, v11         ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3         ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-    vand    v15, v13, v10       ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
-
-    vsrab   v14, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor    v8, v8, v8
-    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9        ;# +=  63
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
-    vaddsbs v1, v1, v10
-
-    vxor    v6, v6, v11
-    vxor    v1, v1, v11
-
-    ;# roughly 2/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
-    vaddsbs v2, v2, v10
-
-    vxor    v5, v5, v11
-    vxor    v2, v2, v11
-
-    ;# roughly 3/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
-    vaddsbs v3, v3, v10
-
-    vxor    v4, v4, v11
-    vxor    v3, v3, v11
-.endm
-
-.macro SBFilter
-    Masks
-
-    common_adjust v3, v4, v2, v5, 1
-
-    ;# outer tap adjustments
-    vspltisb v8, 1
-
-    vaddubm v13, v13, v8        ;# f  += 1
-    vsrab   v13, v13, v8        ;# f >>= 1
-
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
-    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
-
-    vxor    v2, v2, v11
-    vxor    v3, v3, v11
-    vxor    v4, v4, v11
-    vxor    v5, v5, v11
-.endm
-
-    .align 2
-mbloop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    vp8_mbfilter
-
-    stvx     v1, r7, r6         ;# P2
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-    stvx     v6, r5, r3         ;# Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    SBFilter
-
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary.
-;#  So we can read in an entire mb aligned.  However if we want to filter the mb
-;#  edge we run into problems.  For the loopfilter we require 4 bytes before the mb
-;#  and 4 after for a total of 8 bytes.  Reading 16 bytes inorder to get 4 is a bit
-;#  of a waste.  So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;#  memory to align and order them up.  Then they are read in using the
-;#  vector register file.
-.macro RLVmb V, R
-    lwzux   r0, r3, r4
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r3, r4
-    stw     r0,12(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-.macro WLVmb V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 8(\R)
-    stw     r0,-4(r3)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-    sub     r3, r3, r4
-
-    RLVmb v0, r9
-    RLVmb v1, r9
-    RLVmb v2, r9
-    RLVmb v3, r9
-    RLVmb v4, r9
-    RLVmb v5, r9
-    RLVmb v6, r9
-    RLVmb v7, r9
-
-    transpose8x16_fwd
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    vp8_mbfilter
-
-    transpose8x16_inv
-
-    add r3, r3, r4
-    neg r4, r4
-
-    WLVmb v17, r9
-    WLVmb v16, r9
-    WLVmb v15, r9
-    WLVmb v14, r9
-    WLVmb v13, r9
-    WLVmb v12, r9
-    WLVmb v11, r9
-    WLVmb v10, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RL V, R, P
-    lvx     \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro WL V, R, P
-    stvx    \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
-                                ;# K = |P0-P1| already
-    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
-    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
-    vcmpgtub v10, v14, v0
-
-    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1]
-
-    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
-    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
-    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
-
-    vmaxub   v14, v14, v4       ;# M = max interior abs diff
-    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
-
-    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
-    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
-    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
-
-    ;# replace P1,Q1 w/signed versions
-    common_adjust \P0, \Q0, \P1, \Q1, 1
-
-    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
-    vsrab   v13, v13, v1
-    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
-    vsubsbs \Q1, \Q1, v13
-    vaddsbs \P1, \P1, v13
-
-    vxor    \P1, \P1, v11       ;# P1
-    vxor    \P0, \P0, v11       ;# P0
-    vxor    \Q0, \Q0, v11       ;# Q0
-    vxor    \Q1, \Q1, v11       ;# Q1
-.endm
-
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    addi    r9, r3, 0
-    RL      v16, r9, r4
-    RL      v17, r9, r4
-    RL      v18, r9, r4
-    RL      v19, r9, r4
-    RL      v20, r9, r4
-    RL      v21, r9, r4
-    RL      v22, r9, r4
-    RL      v23, r9, r4
-    RL      v24, r9, r4
-    RL      v25, r9, r4
-    RL      v26, r9, r4
-    RL      v27, r9, r4
-    RL      v28, r9, r4
-    RL      v29, r9, r4
-    RL      v30, r9, r4
-    lvx     v31, 0, r9
-
-    Transpose16x16
-
-    vspltisb v1, 1
-
-    build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18                            ;# K(v14) = first |P0-P1|
-
-    Fil v16, v17, v18, v19,  v20, v21, v22, v23
-    Fil v20, v21, v22, v23,  v24, v25, v26, v27
-    Fil v24, v25, v26, v27,  v28, v29, v30, v31
-
-    Transpose16x16
-
-    addi    r9, r3, 0
-    WL      v16, r9, r4
-    WL      v17, r9, r4
-    WL      v18, r9, r4
-    WL      v19, r9, r4
-    WL      v20, r9, r4
-    WL      v21, r9, r4
-    WL      v22, r9, r4
-    WL      v23, r9, r4
-    WL      v24, r9, r4
-    WL      v25, r9, r4
-    WL      v26, r9, r4
-    WL      v27, r9, r4
-    WL      v28, r9, r4
-    WL      v29, r9, r4
-    WL      v30, r9, r4
-    stvx    v31, 0, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
-    andi.   r7, r3, 8       ;# row origin modulo 16
-    add     r7, r7, r7      ;# selects selectors
-    lis     r12, _chromaSelectors@ha
-    la      r0,  _chromaSelectors@l(r12)
-    lwzux   r0, r7, r0      ;# leave selector addr in r7
-
-    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
-    lvx     \U, \Offs, r3
-    lvx     \V, \Offs, r4
-    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
-    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
-    vperm   \V, \New, \V, \Vmask
-    stvx    \U, \Offs, r3           ;# Write to frame buffer
-    stvx    \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
-    neg     r9, r5          ;# r9 = -1 * stride
-    add     r8, r9, r9      ;# r8 = -2 * stride
-    add     r10, r5, r5     ;# r10 = 2 * stride
-
-    active_chroma_sel v12
-
-    ;# P3, Q3 are read-only; need not save addresses or sibling pels
-    add     r6, r8, r8      ;# r6 = -4 * stride
-    hread_uv v0, v14, v15, r6, v12
-    add     r6, r10, r5     ;# r6 =  3 * stride
-    hread_uv v7, v14, v15, r6, v12
-
-    ;# Others are read/write; save addresses and sibling pels
-
-    add     r6, r8, r9      ;# r6 = -3 * stride
-    hread_uv v1, v16, v17, r6,  v12
-    hread_uv v2, v18, v19, r8,  v12
-    hread_uv v3, v20, v21, r9,  v12
-    hread_uv v4, v22, v23, 0,   v12
-    hread_uv v5, v24, v25, r5,  v12
-    hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
-    load_g   \V, 4(r7)
-.endm
-
-.macro vresult_sel V
-    load_g   \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
-    uresult_sel v11
-    vresult_sel v12
-    hwrite_uv v2, v18, v19, r8, v11, v12
-    hwrite_uv v3, v20, v21, r9, v11, v12
-    hwrite_uv v4, v22, v23, 0,  v11, v12
-    hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    vp8_mbfilter
-
-    store_chroma_h
-
-    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
-    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    SBFilter
-
-    store_chroma_h
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro R V, R
-    lwzux   r0, r3, r5
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r4, r5
-    stw     r0,12(\R)
-    lwz     r0,-4(r4)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-
-.macro W V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r4, r5
-    lwz     r0, 8(\R)
-    stw     r0,-4(r4)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r5
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-.macro chroma_vread R
-    sub r3, r3, r5          ;# back up one line for simplicity
-    sub r4, r4, r5
-
-    R v0, \R
-    R v1, \R
-    R v2, \R
-    R v3, \R
-    R v4, \R
-    R v5, \R
-    R v6, \R
-    R v7, \R
-
-    transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
-    transpose8x16_inv
-
-    add     r3, r3, r5
-    add     r4, r4, r5
-    neg     r5, r5          ;# Write rows back in reverse order
-
-    W v17, \R
-    W v16, \R
-    W v15, \R
-    W v14, \R
-    W v13, \R
-    W v12, \R
-    W v11, \R
-    W v10, \R
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    vp8_mbfilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    SBFilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
-    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8    ;# v5 = true if _over_ limit
-
-    ;# preserve unsigned v0 and v3
-    common_adjust v1, v2, v0, v3, 0
-
-    vxor v1, v1, v11
-    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
-    addi    r8,  0, 16
-    addi    r7, r5, 32
-
-    lvx     v0,  0, r5
-    lvx     v1, r8, r5
-    lvx     v2,  0, r7
-    lvx     v3, r8, r7
-
-    lis     r12, _B_hihi@ha
-    la      r0,  _B_hihi@l(r12)
-    lvx     v16, 0, r0
-
-    lis     r12, _B_lolo@ha
-    la      r0,  _B_lolo@l(r12)
-    lvx     v17, 0, r0
-
-    Transpose4times4x4 v16, v17
-    vp8_simple_filter
-
-    vxor v0, v0, v11
-    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels
-
-    Transpose4times4x4 v16, v17
-
-    stvx    v0,  0, r5
-    stvx    v1, r8, r5
-    stvx    v2,  0, r7
-    stvx    v3, r8, r7
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    neg     r5, r4              ;# r5 = -1 * stride
-    add     r6, r5, r5          ;# r6 = -2 * stride
-
-    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
-    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
-    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
-    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
-
-    vp8_simple_filter
-
-    stvx    v1, r5, r3          ;# store P0
-    stvx    v2,  0, r3          ;# store Q0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RLV Offs
-    stw     r0, (\Offs*4)(r5)
-    lwzux   r0, r7, r4
-.endm
-
-.macro WLV Offs
-    lwz     r0, (\Offs*4)(r5)
-    stwux   r0, r7, r4
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    la r5, -96(r1)              ;# temporary space for reading in vectors
-
-    ;# Store 4 pels at word "Offs" in temp array, then advance r7
-    ;#   to next row and read another 4 pels from the frame buffer.
-
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r7          ;# read first 4 pels
-
-    ;# 16 unaligned word accesses
-    RLV 0
-    RLV 4
-    RLV 8
-    RLV 12
-    RLV 1
-    RLV 5
-    RLV 9
-    RLV 13
-    RLV 2
-    RLV 6
-    RLV 10
-    RLV 14
-    RLV 3
-    RLV 7
-    RLV 11
-
-    stw     r0, (15*4)(r5)      ;# write last 4 pels
-
-    simple_vertical
-
-    ;# Read temp array, write frame buffer.
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r5          ;# read/write first 4 pels
-    stwx    r0,  0, r7
-
-    WLV 4
-    WLV 8
-    WLV 12
-    WLV 1
-    WLV 5
-    WLV 9
-    WLV 13
-    WLV 2
-    WLV 6
-    WLV 10
-    WLV 14
-    WLV 3
-    WLV 7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-_chromaSelectors:
-    .long   _B_hihi
-    .long   _B_Ures0
-    .long   _B_Vres0
-    .long   0
-    .long   _B_lolo
-    .long   _B_Ures8
-    .long   _B_Vres8
-    .long   0
-
-    .align 4
-_B_Vres8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
-
-    .align 4
-_B_lolo:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/common/ppc/vp9_platform_altivec.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_contex:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
--- a/vp9/common/ppc/vp9_recon_altivec.asm
+++ /dev/null
@@ -1,175 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl recon4b_ppc
-    .globl recon2b_ppc
-    .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
-    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
-    addi    \Pred, \Pred, 16        ;# next pred
-    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff           ;# v3 = d0..d7
-    vaddshs v2, v2, v3              ;# v2 = r0..r7
-    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff           ;# v3 = d8..d15
-    addi    \Diff, \Diff, 32        ;# next diff
-    vaddshs v3, v3, v1              ;# v3 = r8..r15
-    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, \Dst            ;# to dst
-    add     \Dst, \Dst, \Stride     ;# next dst
-.endm
-
-    .text
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon4b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
-    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
-    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff       ;# v3 = d0..d7
-    vaddshs v2, v2, v3          ;# v2 = r0..r7
-    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff       ;# v2 = d8..d15
-    vaddshs v3, v3, v1          ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v3 = 8-bit r0..r15
-    stvx    v2,  0, r10         ;# 2 rows to dst from buf
-    lwz     r0, 0(r10)
-.if \write_first_four_pels
-    stw     r0, 0(\Dst)
-    .else
-    stwux   r0, \Dst, \Stride
-.endif
-    lwz     r0, 4(r10)
-    stw     r0, 4(\Dst)
-    lwz     r0, 8(r10)
-    stwux   r0, \Dst, \Stride       ;# advance dst to next row
-    lwz     r0, 12(r10)
-    stw     r0, 4(\Dst)
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-
-recon2b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    la      r10, -48(r1)                ;# buf
-
-    two_rows_of8 r3, r4, r5, r6, 1
-
-    addi    r4, r4, 16;                 ;# next pred
-    addi    r3, r3, 32;                 ;# next diff
-
-    two_rows_of8 r3, r4, r5, r6, 0
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro get_two_diff_rows
-    stw     r0, 0(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 4(r10)
-    lwzu    r0, 32(r3)
-    stw     r0, 8(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 12(r10)
-    lvx     v3, 0, r10
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon_b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-
-    la      r10, -48(r1)    ;# buf
-
-    lwz     r0, 0(r4)
-    stw     r0, 0(r10)
-    lwz     r0, 16(r4)
-    stw     r0, 4(r10)
-    lwz     r0, 32(r4)
-    stw     r0, 8(r10)
-    lwz     r0, 48(r4)
-    stw     r0, 12(r10)
-
-    lvx     v1,  0, r10;    ;# v1 = pred = p0..p15
-
-    lwz r0, 0(r3)           ;# v3 = d0..d7
-
-    get_two_diff_rows
-
-    vmrghb  v2, v0, v1;     ;# v2 = 16-bit p0..p7
-    vaddshs v2, v2, v3;     ;# v2 = r0..r7
-
-    lwzu r0, 32(r3)         ;# v3 = d8..d15
-
-    get_two_diff_rows
-
-    vmrglb  v1, v0, v1;     ;# v1 = 16-bit p8..p15
-    vaddshs v3, v3, v1;     ;# v3 = r8..r15
-
-    vpkshus v2, v2, v3;     ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10;    ;# 16 pels to dst from buf
-
-    lwz     r0, 0(r10)
-    stw     r0, 0(r5)
-    lwz     r0, 4(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 8(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 12(r10)
-    stwx    r0, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ /dev/null
@@ -1,167 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_loopfilter.h"
-#include "recon.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
-extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr,
-                                             unsigned char *dst_ptr,
-                                             int src_pixels_per_line,
-                                             int dst_pixels_per_line,
-                                             int rows, int cols, int flimit);
-
-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch,
-                                    int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
-                                   int rows, int cols, int flimit);
-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch,
-                                         int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
-                                        int rows, int cols, int flimit);
-extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line,
-                                            int rows, int cols, int flimit);
-void vp9_plane_add_noise_c(unsigned char *start,
-                           unsigned int width, unsigned int height,
-                           int pitch, int q, int a);
-
-extern copy_mem_block_function *vp9_copy_mem16x16;
-extern copy_mem_block_function *vp9_copy_mem8x8;
-extern copy_mem_block_function *vp9_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp9_sixtap_predict_c;
-extern subpixel_predict_function vp9_sixtap_predict8x4_c;
-extern subpixel_predict_function vp9_sixtap_predict8x8_c;
-extern subpixel_predict_function vp9_sixtap_predict16x16_c;
-extern subpixel_predict_function vp9_bilinear_predict4x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x8_c;
-extern subpixel_predict_function vp9_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp9_copy_mem16x16_c;
-extern copy_mem_block_function vp9_copy_mem8x8_c;
-extern copy_mem_block_function vp9_copy_mem8x4_c;
-
-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-
-extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp9_loop_filter_mbv_c;
-extern loop_filter_block_function vp9_loop_filter_bv_c;
-extern loop_filter_block_function vp9_loop_filter_mbh_c;
-extern loop_filter_block_function vp9_loop_filter_bh_c;
-
-extern loop_filter_block_function vp9_loop_filter_mbvs_c;
-extern loop_filter_block_function vp9_loop_filter_bvs_c;
-extern loop_filter_block_function vp9_loop_filter_mbhs_c;
-extern loop_filter_block_function vp9_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp9_clear_c(void) {
-}
-
-void vp9_machine_specific_config(void) {
-  // Pure C:
-  vp9_clear_system_state                = vp9_clear_c;
-  vp9_recon_b                          = vp9_recon_b_c;
-  vp9_recon4b                         = vp9_recon4b_c;
-  vp9_recon2b                         = vp9_recon2b_c;
-
-  vp9_bilinear_predict16x16            = bilinear_predict16x16_ppc;
-  vp9_bilinear_predict8x8              = bilinear_predict8x8_ppc;
-  vp9_bilinear_predict8x4              = bilinear_predict8x4_ppc;
-  vp8_bilinear_predict                 = bilinear_predict4x4_ppc;
-
-  vp9_sixtap_predict16x16              = sixtap_predict16x16_ppc;
-  vp9_sixtap_predict8x8                = sixtap_predict8x8_ppc;
-  vp9_sixtap_predict8x4                = sixtap_predict8x4_ppc;
-  vp9_sixtap_predict                   = sixtap_predict_ppc;
-
-  vp8_short_idct4x4_1                  = vp9_short_idct4x4_1_c;
-  vp8_short_idct4x4                    = short_idct4x4_ppc;
-  vp8_dc_only_idct                      = vp8_dc_only_idct_c;
-
-  vp8_lf_mbvfull                       = loop_filter_mbv_ppc;
-  vp8_lf_bvfull                        = loop_filter_bv_ppc;
-  vp8_lf_mbhfull                       = loop_filter_mbh_ppc;
-  vp8_lf_bhfull                        = loop_filter_bh_ppc;
-
-  vp8_lf_mbvsimple                     = loop_filter_mbvs_ppc;
-  vp8_lf_bvsimple                      = loop_filter_bvs_ppc;
-  vp8_lf_mbhsimple                     = loop_filter_mbhs_ppc;
-  vp8_lf_bhsimple                      = loop_filter_bhs_ppc;
-
-  vp9_post_proc_down_and_across           = vp9_post_proc_down_and_across_c;
-  vp9_mbpost_proc_down                  = vp9_mbpost_proc_down_c;
-  vp9_mbpost_proc_across_ip              = vp9_mbpost_proc_across_ip_c;
-  vp9_plane_add_noise                   = vp9_plane_add_noise_c;
-
-  vp9_copy_mem16x16                    = copy_mem16x16_ppc;
-  vp9_copy_mem8x8                      = vp9_copy_mem8x8_c;
-  vp9_copy_mem8x4                      = vp9_copy_mem8x4_c;
-
-}
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -10,84 +10,109 @@
 
 
 #include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
-  int stride = cpi->mode_info_stride;
+void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
+  const int stride = cm->mode_info_stride;
   int i;
 
   // Clear down top border row
-  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
+  vpx_memset(mi, 0, sizeof(MODE_INFO) * stride);
 
   // Clear left border column
-  for (i = 1; i < cpi->mb_rows + 1; i++) {
-    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
-  }
+  for (i = 1; i < cm->mi_rows + 1; i++)
+    vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
 }
 
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
+void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) {
   int i, j;
 
   // For each in image mode_info element set the in image flag to 1
-  for (i = 0; i < cpi->mb_rows; i++) {
-    for (j = 0; j < cpi->mb_cols; j++) {
-      mi->mbmi.mb_in_image = 1;
-      mi++;   // Next element in the row
+  for (i = 0; i < cm->mi_rows; i++) {
+    MODE_INFO *ptr = mi;
+    for (j = 0; j < cm->mi_cols; j++) {
+      ptr->mbmi.mb_in_image = 1;
+      ptr++;  // Next element in the row
     }
 
-    mi++;       // Step over border element at start of next row
+    // Step over border element at start of next row
+    mi += cm->mode_info_stride;
   }
 }
 
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
+void vp9_free_frame_buffers(VP9_COMMON *oci) {
   int i;
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+    vp9_free_frame_buffer(&oci->yv12_fb[i]);
 
-  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
-  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+  vp9_free_frame_buffer(&oci->temp_scale_frame);
+  vp9_free_frame_buffer(&oci->post_proc_buffer);
 
-  vpx_free(oci->above_context);
   vpx_free(oci->mip);
   vpx_free(oci->prev_mip);
+  vpx_free(oci->above_seg_context);
 
-  oci->above_context = 0;
+  vpx_free(oci->above_context[0]);
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    oci->above_context[i] = 0;
   oci->mip = 0;
   oci->prev_mip = 0;
+  oci->above_seg_context = 0;
+}
 
+static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
+  cm->mb_cols = (aligned_width + 8) >> 4;
+  cm->mb_rows = (aligned_height + 8) >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+
+  cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
+  cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
+  cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE;
 }
 
+static void setup_mi(VP9_COMMON *cm) {
+  cm->mi = cm->mip + cm->mode_info_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+
+  vpx_memset(cm->mip, 0,
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+
+  vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_in_image(cm, cm->mi);
+
+  vp9_update_mode_info_border(cm, cm->prev_mip);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
-  int i;
-  int aligned_width, aligned_height;
+  int i, mi_cols;
 
-  vp9_de_alloc_frame_buffers(oci);
+  // Our internal buffers are always multiples of 16
+  const int aligned_width = multiple8(width);
+  const int aligned_height = multiple8(height);
+  const int ss_x = oci->subsampling_x;
+  const int ss_y = oci->subsampling_y;
 
-  /* our internal buffers are always multiples of 16 */
-  aligned_width = (width + 15) & ~15;
-  aligned_height = (height + 15) & ~15;
+  vp9_free_frame_buffers(oci);
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
     oci->fb_idx_ref_cnt[i] = 0;
-    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
-                                    VP9BORDERINPIXELS) < 0) {
-      vp9_de_alloc_frame_buffers(oci);
-      return 1;
-    }
+    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
+                               VP9BORDERINPIXELS) < 0)
+      goto fail;
   }
 
   oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
   oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
 
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
     oci->active_ref_idx[i] = i;
 
   for (i = 0; i < NUM_REF_FRAMES; i++) {
@@ -95,125 +120,86 @@
     oci->fb_idx_ref_cnt[i] = 1;
   }
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
-                                  VP9BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y,
+                             VP9BORDERINPIXELS) < 0)
+    goto fail;
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
-                                  VP9BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
+                             VP9BORDERINPIXELS) < 0)
+    goto fail;
 
-  oci->mb_rows = aligned_height >> 4;
-  oci->mb_cols = aligned_width >> 4;
-  oci->MBs = oci->mb_rows * oci->mb_cols;
-  oci->mode_info_stride = oci->mb_cols + 1;
-  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+  set_mb_mi(oci, aligned_width, aligned_height);
 
-  if (!oci->mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  // Allocation
+  oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE),
+                        sizeof(MODE_INFO));
+  if (!oci->mip)
+    goto fail;
 
-  oci->mi = oci->mip + oci->mode_info_stride + 1;
+  oci->prev_mip = vpx_calloc(oci->mode_info_stride *
+                             (oci->mi_rows + 64 / MI_SIZE),
+                             sizeof(MODE_INFO));
+  if (!oci->prev_mip)
+    goto fail;
 
-  /* allocate memory for last frame MODE_INFO array */
+  setup_mi(oci);
 
-  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+  // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
+  // information is exposed at this level
+  mi_cols = mi_cols_aligned_to_sb(oci);
 
-  if (!oci->prev_mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+  // block where mi unit size is 8x8.
+# if CONFIG_ALPHA
+  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1);
+#else
+  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
+#endif
+  if (!oci->above_context[0])
+    goto fail;
 
-  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    oci->above_context[i] =
+        oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
 
-  oci->above_context =
-    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1);
+  oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
+  if (!oci->above_seg_context)
+    goto fail;
 
-  if (!oci->above_context) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  vp9_update_mode_info_border(oci, oci->mip);
-  vp9_update_mode_info_in_image(oci, oci->mi);
-
   return 0;
-}
 
-void vp9_setup_version(VP9_COMMON *cm) {
-  if (cm->version & 0x4) {
-    if (!CONFIG_EXPERIMENTAL)
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Bitstream was created by an experimental "
-                         "encoder");
-    cm->experimental = 1;
-  }
-
-  switch (cm->version & 0x3) {
-    case 0:
-      cm->no_lpf = 0;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 0;
-      cm->full_pixel = 0;
-      break;
-    case 1:
-      cm->no_lpf = 0;
-      cm->filter_type = SIMPLE_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-    case 2:
-    case 3:
-      cm->no_lpf = 1;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-      // Full pel only code deprecated in experimental code base
-      // case 3:
-      //    cm->no_lpf = 1;
-      //    cm->filter_type = SIMPLE_LOOPFILTER;
-      //    cm->use_bilinear_mc_filter = 1;
-      //    cm->full_pixel = 1;
-      //    break;
-  }
+ fail:
+  vp9_free_frame_buffers(oci);
+  return 1;
 }
+
 void vp9_create_common(VP9_COMMON *oci) {
   vp9_machine_specific_config(oci);
 
   vp9_init_mbmode_probs(oci);
 
-  vp9_default_bmode_probs(oci->fc.bmode_prob);
-
   oci->txfm_mode = ONLY_4X4;
-  oci->mb_no_coeff_skip = 1;
   oci->comp_pred_mode = HYBRID_PREDICTION;
-  oci->no_lpf = 0;
-  oci->filter_type = NORMAL_LOOPFILTER;
-  oci->use_bilinear_mc_filter = 0;
-  oci->full_pixel = 0;
   oci->clr_type = REG_YUV;
-  oci->clamp_type = RECON_CLAMP_REQUIRED;
 
-  /* Initialise reference frame sign bias structure to defaults */
+  // Initialize reference frame sign bias structure to defaults
   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
-
-  oci->kf_ymode_probs_update = 0;
 }
 
 void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_de_alloc_frame_buffers(oci);
+  vp9_free_frame_buffers(oci);
 }
 
 void vp9_initialize_common() {
   vp9_coef_tree_initialize();
-
   vp9_entropy_mode_init();
-
   vp9_entropy_mv_init();
+}
+
+void vp9_update_frame_size(VP9_COMMON *cm) {
+  const int aligned_width = multiple8(cm->width);
+  const int aligned_height = multiple8(cm->height);
+
+  set_mb_mi(cm, aligned_width, aligned_height);
+  setup_mi(cm);
 }
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -14,13 +14,18 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 
+void vp9_initialize_common();
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
 void vp9_create_common(VP9_COMMON *oci);
 void vp9_remove_common(VP9_COMMON *oci);
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
+
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_setup_version(VP9_COMMON *oci);
+void vp9_free_frame_buffers(VP9_COMMON *oci);
 
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
+void vp9_update_frame_size(VP9_COMMON *cm);
 
 #endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
--- a/vp9/common/vp9_blockd.c
+++ /dev/null
@@ -1,442 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24] = {
-  { 0, 0, 0, 0,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    3, 3, 3, 3,
-    4, 4,
-    5, 5,
-    6, 6,
-    7, 7 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    2, 2, 2, 2,
-    2, 2, 2, 2,
-    4, 4,
-    4, 4,
-    6, 6,
-    6, 6 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0 },
-};
-const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24] = {
-  { 0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    4, 5,
-    4, 5,
-    6, 7,
-    6, 7 },
-  { 0, 0, 0, 0,
-    2, 2, 2, 2,
-    0, 0, 0, 0,
-    2, 2, 2, 2,
-    4, 4,
-    4, 4,
-    6, 6,
-    6, 6 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0 },
-};
-
-#define S(x) x + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)
-const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
-    4, 4, 4, 4,
-    5, 5, 5, 5,
-    S(4), S(4), S(4), S(4),
-    S(5), S(5), S(5), S(5),
-    6, 6, 6, 6,
-    7, 7, 7, 7,
-    S(6), S(6), S(6), S(6),
-    S(7), S(7), S(7), S(7) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    S(4), S(4), S(4), S(4),
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6),
-    S(6), S(6), S(6), S(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6 },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0 },
-};
-const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96] = {
-  { 0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7) },
-  { 0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6),
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6 },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0 },
-};
-
-#define T(x) x + 2 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))
-#define U(x) x + 3 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))
-const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5,
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    S(5), S(5), S(5), S(5), S(5), S(5), S(5), S(5),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(5), T(5), T(5), T(5), T(5), T(5), T(5), T(5),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    U(5), U(5), U(5), U(5), U(5), U(5), U(5), U(5),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    S(7), S(7), S(7), S(7), S(7), S(7), S(7), S(7),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(7), T(7), T(7), T(7), T(7), T(7), T(7), T(7),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
-    U(7), U(7), U(7), U(7), U(7), U(7), U(7), U(7) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6 },
-};
-const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384] = {
-  { 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7) },
-  { 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6 },
-};
-#undef U
-#undef T
-#undef S
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -12,8 +12,6 @@
 #ifndef VP9_COMMON_VP9_BLOCKD_H_
 #define VP9_COMMON_VP9_BLOCKD_H_
 
-void vpx_log(const char *format, ...);
-
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_convolve.h"
@@ -21,35 +19,27 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
 
-#define TRUE    1
-#define FALSE   0
+#define BLOCK_SIZE_GROUPS   4
+#define MAX_MB_SEGMENTS     8
+#define MB_SEG_TREE_PROBS   (MAX_MB_SEGMENTS-1)
 
-// #define MODE_STATS
-
-/*#define DCPRED 1*/
-#define DCPREDSIMTHRESH 0
-#define DCPREDCNTTHRESH 3
-
-#define MB_FEATURE_TREE_PROBS   3
 #define PREDICTION_PROBS 3
 
 #define MBSKIP_CONTEXTS 3
 
-#define MAX_MB_SEGMENTS         4
-
 #define MAX_REF_LF_DELTAS       4
-#define MAX_MODE_LF_DELTAS      4
+#define MAX_MODE_LF_DELTAS      2
 
 /* Segment Feature Masks */
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
-#define MAX_MV_REFS 9
-#define MAX_MV_REF_CANDIDATES 4
+#define MAX_MV_REF_CANDIDATES 2
 
-typedef struct {
-  int r, c;
-} POS;
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
 
 typedef enum {
   PLANE_TYPE_Y_WITH_DC,
@@ -57,24 +47,21 @@
 } PLANE_TYPE;
 
 typedef char ENTROPY_CONTEXT;
-typedef struct {
-  ENTROPY_CONTEXT y1[4];
-  ENTROPY_CONTEXT u[2];
-  ENTROPY_CONTEXT v[2];
-} ENTROPY_CONTEXT_PLANES;
 
-#define VP9_COMBINEENTROPYCONTEXTS(Dest, A, B) \
-  Dest = ((A)!=0) + ((B)!=0);
+typedef char PARTITION_CONTEXT;
 
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return (a != 0) + (b != 0);
+}
+
 typedef enum {
   KEY_FRAME = 0,
-  INTER_FRAME = 1
+  INTER_FRAME = 1,
+  NUM_FRAME_TYPES,
 } FRAME_TYPE;
 
 typedef enum {
-#if CONFIG_ENABLE_6TAP
-  SIXTAP,
-#endif
   EIGHTTAP_SMOOTH,
   EIGHTTAP,
   EIGHTTAP_SHARP,
@@ -83,26 +70,27 @@
 } INTERPOLATIONFILTERTYPE;
 
 typedef enum {
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
-  B_PRED,             /* block based prediction, each block has its own prediction mode */
+  DC_PRED,         // Average of above and left pixels
+  V_PRED,          // Vertical
+  H_PRED,          // Horizontal
+  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
+  D135_PRED,       // Directional 135 deg = 180 - 45
+  D117_PRED,       // Directional 117 deg = 180 - 63
+  D153_PRED,       // Directional 153 deg = 180 - 27
+  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
+  TM_PRED,         // True-motion
   NEARESTMV,
   NEARMV,
   ZEROMV,
   NEWMV,
-  SPLITMV,
   MB_MODE_COUNT
 } MB_PREDICTION_MODE;
 
+static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
+  return mode >= NEARESTMV && mode <= NEWMV;
+}
+
 // Segment level features.
 typedef enum {
   SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....
@@ -117,8 +105,7 @@
   TX_4X4 = 0,                      // 4x4 dct transform
   TX_8X8 = 1,                      // 8x8 dct transform
   TX_16X16 = 2,                    // 16x16 dct transform
-  TX_SIZE_MAX_MB = 3,              // Number of different transforms available
-  TX_32X32 = TX_SIZE_MAX_MB,       // 32x32 dct transform
+  TX_32X32 = 3,                    // 32x32 dct transform
   TX_SIZE_MAX_SB,                  // Number of transforms available to SBs
 } TX_SIZE;
 
@@ -129,62 +116,19 @@
   ADST_ADST = 3                       // ADST in both directions
 } TX_TYPE;
 
-#define VP9_YMODES  (B_PRED + 1)
-#define VP9_UV_MODES (TM_PRED + 1)
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#define VP9_I32X32_MODES (TM_PRED + 1)
+#define VP9_INTRA_MODES (TM_PRED + 1)
 
-#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
+#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
 
 #define WHT_UPSCALE_FACTOR 2
 
-typedef enum {
-  B_DC_PRED,          /* average of above and left pixels */
-  B_TM_PRED,
+#define TX_SIZE_PROBS  6  // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
 
-  B_VE_PRED,          /* vertical prediction */
-  B_HE_PRED,          /* horizontal prediction */
+#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
+                            (c)->fc.tx_probs_8x8p :    \
+                            (b) < BLOCK_SIZE_SB32X32 ? \
+                            (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
 
-  B_LD_PRED,
-  B_RD_PRED,
-
-  B_VR_PRED,
-  B_VL_PRED,
-  B_HD_PRED,
-  B_HU_PRED,
-#if CONFIG_NEWBINTRAMODES
-  B_CONTEXT_PRED,
-#endif
-
-  LEFT4X4,
-  ABOVE4X4,
-  ZERO4X4,
-  NEW4X4,
-
-  B_MODE_COUNT
-} B_PREDICTION_MODE;
-
-#define VP9_BINTRAMODES (LEFT4X4)
-#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
-
-#if CONFIG_NEWBINTRAMODES
-/* The number of B_PRED intra modes that are replaced by B_CONTEXT_PRED */
-#define CONTEXT_PRED_REPLACEMENTS  0
-#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES - 1)
-#define VP9_NKF_BINTRAMODES  (VP9_BINTRAMODES - CONTEXT_PRED_REPLACEMENTS)
-#else
-#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES)   /* 10 */
-#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES)  /* 10 */
-#endif
-
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -191,11 +135,7 @@
 
 union b_mode_info {
   struct {
-    B_PREDICTION_MODE first;
-    TX_TYPE           tx_type;
-#if CONFIG_NEWBINTRAMODES
-    B_PREDICTION_MODE context;
-#endif
+    MB_PREDICTION_MODE first;
   } as_mode;
   int_mv as_mv[2];  // first, second inter predictor motion vectors
 };
@@ -209,37 +149,80 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
-typedef enum {
-  BLOCK_SIZE_MB16X16 = 0,
-  BLOCK_SIZE_SB32X32 = 1,
-  BLOCK_SIZE_SB64X64 = 2,
-} BLOCK_SIZE_TYPE;
+static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
+  switch (sb_type) {
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4: return 0;
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB8X8:
+    case BLOCK_SIZE_SB8X16: return 1;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_MB16X16:
+    case BLOCK_SIZE_SB16X32: return 2;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB32X32:
+    case BLOCK_SIZE_SB32X64: return 3;
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB64X64: return 4;
+    default: assert(0);
+      return -1;
+  }
+}
 
+static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
+  switch (sb_type) {
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_AB4X4: return 0;
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_SB8X8:
+    case BLOCK_SIZE_SB16X8: return 1;
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_MB16X16:
+    case BLOCK_SIZE_SB32X16: return 2;
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_SB32X32:
+    case BLOCK_SIZE_SB64X32: return 3;
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB64X64: return 4;
+    default: assert(0);
+      return -1;
+  }
+}
+
+static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
+  int a = b_width_log2(sb_type) - 1;
+  // align 4x4 block to mode_info
+  if (a < 0)
+    a = 0;
+  assert(a >= 0);
+  return a;
+}
+
+static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
+  int a = b_height_log2(sb_type) - 1;
+  if (a < 0)
+    a = 0;
+  assert(a >= 0);
+  return a;
+}
+
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
-#endif
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  MV_REFERENCE_FRAME ref_frame[2];
   TX_SIZE txfm_size;
   int_mv mv[2]; // for each reference frame used
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int_mv best_mv, best_second_mv;
-#if CONFIG_NEW_MVREF
-  int best_index, best_second_index;
-#endif
 
   int mb_mode_context[MAX_REF_FRAMES];
 
-  SPLITMV_PARTITIONING_TYPE partitioning;
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+  unsigned char segment_id;           // Segment id for current frame
 
   // Flags used for prediction status of various bistream signals
   unsigned char seg_id_predicted;
-  unsigned char ref_predicted;
 
   // Indicates if the mb is part of the image (1) vs border (0)
   // This can be useful in determining whether the MB provides
@@ -249,69 +232,62 @@
   INTERPOLATIONFILTERTYPE interp_filter;
 
   BLOCK_SIZE_TYPE sb_type;
-#if CONFIG_CODE_NONZEROCOUNT
-  uint16_t nzcs[256+64*2];
-#endif
 } MB_MODE_INFO;
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
+  union b_mode_info bmi[4];
 } MODE_INFO;
 
-typedef struct blockd {
-  int16_t *qcoeff;
-  int16_t *dqcoeff;
-  uint8_t *predictor;
-  int16_t *diff;
-  int16_t *dequant;
-
-  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  uint8_t **base_pre;
-  uint8_t **base_second_pre;
-  int pre;
-  int pre_stride;
-
-  uint8_t **base_dst;
-  int dst;
-  int dst_stride;
-
-  union b_mode_info bmi;
-} BLOCKD;
-
+#define VP9_REF_SCALE_SHIFT 14
 struct scale_factors {
-  int x_num;
-  int x_den;
+  int x_scale_fp;   // horizontal fixed point scale factor
+  int y_scale_fp;   // vertical fixed point scale factor
   int x_offset_q4;
   int x_step_q4;
-  int y_num;
-  int y_den;
   int y_offset_q4;
   int y_step_q4;
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-  convolve_fn_t predict[2][2][8];  // horiz, vert, weight (0 - 7)
-#else
+
+  int (*scale_value_x)(int val, const struct scale_factors *scale);
+  int (*scale_value_y)(int val, const struct scale_factors *scale);
+  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
+  int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv,
+                                const struct scale_factors *scale);
+  int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4);
+
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-#endif
 };
 
-typedef struct macroblockd {
-  DECLARE_ALIGNED(16, int16_t,  diff[64*64+32*32*2]);      /* from idct diff */
-  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);  // unused for superblocks
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);
-#if CONFIG_CODE_NONZEROCOUNT
-  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
+#if CONFIG_ALPHA
+enum { MAX_MB_PLANE = 4 };
+#else
+enum { MAX_MB_PLANE = 3 };
 #endif
 
-  /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
-  BLOCKD block[24];
-  int fullpixel_mask;
+struct buf_2d {
+  uint8_t *buf;
+  int stride;
+};
 
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  YV12_BUFFER_CONFIG second_pre;
-  YV12_BUFFER_CONFIG dst;
+struct macroblockd_plane {
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+  PLANE_TYPE plane_type;
+  int subsampling_x;
+  int subsampling_y;
+  struct buf_2d dst;
+  struct buf_2d pre[2];
+  int16_t *dequant;
+  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *left_context;
+};
+
+#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
+
+typedef struct macroblockd {
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+
   struct scale_factors scale_factor[2];
   struct scale_factors scale_factor_uv[2];
 
@@ -325,11 +301,11 @@
   int left_available;
   int right_available;
 
-  /* Y,U,V */
-  ENTROPY_CONTEXT_PLANES *above_context;
-  ENTROPY_CONTEXT_PLANES *left_context;
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT *left_seg_context;
 
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  /* 0 (disable) 1 (enable) segmentation */
   unsigned char segmentation_enabled;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation map. */
@@ -345,15 +321,10 @@
   /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
 
   // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-  vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS];
+  vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS];
 
-#if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
-#endif
-
   // Segment features
-  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+  int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
   unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
 
   /* mode_based Loop filter adjustment */
@@ -361,10 +332,14 @@
   unsigned char mode_ref_lf_delta_update;
 
   /* Delta values have the range +/- MAX_LOOP_FILTER */
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
-  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+  /* 0 = Intra, Last, GF, ARF */
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  /* 0 = Intra, Last, GF, ARF */
+  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
+  /* 0 = ZERO_MV, MV */
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+  /* 0 = ZERO_MV, MV */
+  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
@@ -377,15 +352,13 @@
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*itxm_add)(int16_t *input, const int16_t *dq,
-    uint8_t *pred, uint8_t *output, int pitch, int stride, int eob);
-  void (*itxm_add_y_block)(int16_t *q, const int16_t *dq,
-    uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd);
-  void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq,
-    uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride,
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
     struct macroblockd *xd);
+  void (*itxm_add_uv_block)(int16_t *q, uint8_t *dst, int stride,
+    uint16_t *eobs);
 
   struct subpix_fn_table  subpix;
 
@@ -393,212 +366,187 @@
 
   int corrupted;
 
-  int sb_index;
-  int mb_index;   // Index of the MB in the SB (0..3)
+  int sb_index;   // index of 32x32 block inside the 64x64 block
+  int mb_index;   // index of 16x16 block inside the 32x32 block
+  int b_index;    // index of 8x8 block inside the 16x16 block
+  int ab_index;   // index of 4x4 block inside the 8x8 block
   int q_index;
 
 } MACROBLOCKD;
 
-#define ACTIVE_HT   110                // quantization stepsize threshold
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+  switch (subsize) {
+    case BLOCK_SIZE_SB64X64:
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      return &xd->sb_index;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      return &xd->mb_index;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_SB8X8:
+      return &xd->b_index;
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4:
+      return &xd->ab_index;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
 
-#define ACTIVE_HT8  300
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            BLOCK_SIZE_TYPE sb_type,
+                                            BLOCK_SIZE_TYPE sb_size) {
+  int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+  int bwl = b_width_log2(sb_type);
+  int bhl = b_height_log2(sb_type);
+  int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  int i;
 
-#define ACTIVE_HT16 300
+  // update the partition context at the end nodes. Set partition bits
+  // of block sizes larger than the current one to be one, and partition
+  // bits of smaller block sizes to be zero.
+  if ((bwl == bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+  } else if ((bwl == bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+  }  else if ((bwl < bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+  } else if ((bwl < bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+  } else {
+    assert(0);
+  }
+}
 
-// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
-static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
-  switch (mode) {
-    case DC_PRED: return B_DC_PRED;
-    case V_PRED: return B_VE_PRED;
-    case H_PRED: return B_HE_PRED;
-    case TM_PRED: return B_TM_PRED;
-    case D45_PRED: return B_LD_PRED;
-    case D135_PRED: return B_RD_PRED;
-    case D117_PRED: return B_VR_PRED;
-    case D153_PRED: return B_HD_PRED;
-    case D27_PRED: return B_HU_PRED;
-    case D63_PRED: return B_VL_PRED;
+static INLINE int partition_plane_context(MACROBLOCKD *xd,
+                                          BLOCK_SIZE_TYPE sb_type) {
+  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+  int above = 0, left = 0, i;
+  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+
+  assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+  assert(bsl >= 0);
+  assert(boffset >= 0);
+
+  for (i = 0; i < bs; i++)
+    above |= (xd->above_seg_context[i] & (1 << boffset));
+  for (i = 0; i < bs; i++)
+    left |= (xd->left_seg_context[i] & (1 << boffset));
+
+  above = (above > 0);
+  left  = (left > 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
+                                   PARTITION_TYPE partition) {
+  BLOCK_SIZE_TYPE subsize;
+  switch (partition) {
+    case PARTITION_NONE:
+      subsize = bsize;
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB64X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB32X16;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB16X8;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB8X4;
+      else
+        assert(0);
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X64;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB16X32;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X16;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB4X8;
+      else
+        assert(0);
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_MB16X16;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X8;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_AB4X4;
+      else
+        assert(0);
+      break;
     default:
-       assert(0);
-       return B_MODE_COUNT;  // Dummy value
+      assert(0);
   }
+  return subsize;
 }
 
 // transform mapping
-static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
+static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
   switch (bmode) {
-    case B_TM_PRED :
-    case B_RD_PRED :
+    case TM_PRED :
+    case D135_PRED :
       return ADST_ADST;
 
-    case B_VE_PRED :
-    case B_VR_PRED :
+    case V_PRED :
+    case D117_PRED :
+    case D63_PRED:
       return ADST_DCT;
 
-    case B_HE_PRED :
-    case B_HD_PRED :
-    case B_HU_PRED :
+    case H_PRED :
+    case D153_PRED :
+    case D27_PRED :
       return DCT_ADST;
 
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-      assert(0);
-      break;
-#endif
-
     default:
       return DCT_DCT;
   }
 }
 
-extern const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24];
-extern const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24];
-extern const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96];
-extern const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96];
-extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384];
-extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384];
-
-#define USE_ADST_FOR_I16X16_8X8   1
-#define USE_ADST_FOR_I16X16_4X4   1
-#define USE_ADST_FOR_I8X8_4X4     1
-#define USE_ADST_PERIPHERY_ONLY   1
-#define USE_ADST_FOR_SB           1
-#define USE_ADST_FOR_REMOTE_EDGE  0
-
 static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
-  // TODO(debargha): explore different patterns for ADST usage when blocksize
-  // is smaller than the prediction size
-  TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
-    return tx_type;
-  if (xd->lossless)
+  TX_TYPE tx_type;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
     return DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == B_PRED &&
-      xd->q_index < ACTIVE_HT) {
-    const BLOCKD *b = &xd->block[ib];
-    tx_type = txfm_map(
-#if CONFIG_NEWBINTRAMODES
-        b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context :
-#endif
-        b->bmi.as_mode.first);
-  } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-    const BLOCKD *b = &xd->block[ib];
-    const int ic = (ib & 10);
-#if USE_ADST_FOR_I8X8_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    const int inner = ib & 5;
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if (inner == 5)
-      tx_type = DCT_DCT;
-#else
-    if (inner == 1) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (inner == 4) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (inner == 5) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-#if USE_ADST_FOR_I16X16_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    const int hmax = 4 << sb_type;
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-      tx_type = DCT_DCT;
-#else
-    if (ib >= 1 && ib < hmax) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (ib != 0) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    tx_type = txfm_map(mi->bmi[ib].as_mode.first);
+  } else {
+    assert(mbmi->mode <= TM_PRED);
+    tx_type = txfm_map(mbmi->mode);
   }
   return tx_type;
 }
 
 static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
-  // TODO(debargha): explore different patterns for ADST usage when blocksize
-  // is smaller than the prediction size
   TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
-    return tx_type;
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    const BLOCKD *b = &xd->block[ib];
-    // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
-    // or the relationship otherwise modified to address this type conversion.
-    tx_type = txfm_map(pred_mode_conv(
-           (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-             xd->q_index < ACTIVE_HT8) {
-#if USE_ADST_FOR_I16X16_8X8
-#if USE_ADST_PERIPHERY_ONLY
-    const int hmax = 4 << sb_type;
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-      tx_type = DCT_DCT;
-#else
-    if (ib >= 1 && ib < hmax) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (ib != 0) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
+    tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
   }
   return tx_type;
 }
@@ -605,71 +553,358 @@
 
 static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
   TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))
-    return tx_type;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-      xd->q_index < ACTIVE_HT16) {
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_PERIPHERY_ONLY
-    if (sb_type) {
-      const int hmax = 4 << sb_type;
-#if USE_ADST_FOR_REMOTE_EDGE
-      if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-        tx_type = DCT_DCT;
-#else
-      if (ib >= 1 && ib < hmax) {
-        if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-        else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-      } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-        if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-        else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-      } else if (ib != 0) {
-        tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
+    tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
+  }
+  return tx_type;
+}
+
+void vp9_setup_block_dptrs(MACROBLOCKD *xd,
+                           int subsampling_x, int subsampling_y);
+
+static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
+  const TX_SIZE size = mbmi->txfm_size;
+
+  switch (mbmi->sb_type) {
+    case BLOCK_SIZE_SB64X64:
+      return size;
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      if (size == TX_32X32)
+        return TX_16X16;
+      else
+        return size;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      if (size == TX_16X16)
+        return TX_8X8;
+      else
+        return size;
+    default:
+      return TX_4X4;
+  }
+
+  return size;
+}
+
+struct plane_block_idx {
+  int plane;
+  int block;
+};
+
+// TODO(jkoleszar): returning a struct so it can be used in a const context,
+// expect to refactor this further later.
+static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
+                                                     int b_idx) {
+  const int v_offset = y_blocks * 5 / 4;
+  struct plane_block_idx res;
+
+  if (b_idx < y_blocks) {
+    res.plane = 0;
+    res.block = b_idx;
+  } else if (b_idx < v_offset) {
+    res.plane = 1;
+    res.block = b_idx - y_blocks;
+  } else {
+    assert(b_idx < y_blocks * 3 / 2);
+    res.plane = 2;
+    res.block = b_idx - v_offset;
+  }
+  return res;
+}
+
+static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
+                                    const struct macroblockd_plane* plane) {
+  return 4 << (b_width_log2(bsize) - plane->subsampling_x);
+}
+
+static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
+                                     const struct macroblockd_plane* plane) {
+  return 4 << (b_height_log2(bsize) - plane->subsampling_y);
+}
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  BLOCK_SIZE_TYPE bsize,
+                                                  int ss_txfrm_size,
+                                                  void *arg);
+
+static INLINE void foreach_transformed_block_in_plane(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
+                                : mbmi->txfm_size;
+  const int block_size_b = bw + bh;
+  const int txfrm_size_b = tx_size * 2;
+
+  // subsampled size of the block
+  const int ss_sum = xd->plane[plane].subsampling_x
+      + xd->plane[plane].subsampling_y;
+  const int ss_block_size = block_size_b - ss_sum;
+
+  const int step = 1 << txfrm_size_b;
+
+  int i;
+
+  assert(txfrm_size_b <= block_size_b);
+  assert(txfrm_size_b <= ss_block_size);
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    int r, c;
+    const int sw = bw - xd->plane[plane].subsampling_x;
+    const int sh = bh - xd->plane[plane].subsampling_y;
+    int max_blocks_wide = 1 << sw;
+    int max_blocks_high = 1 << sh;
+
+    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
+    // it to 4x4 block sizes.
+    if (xd->mb_to_right_edge < 0)
+      max_blocks_wide +=
+          + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+
+    if (xd->mb_to_bottom_edge < 0)
+      max_blocks_high +=
+          + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+
+    i = 0;
+    // Unlike the normal case - in here we have to keep track of the
+    // row and column of the blocks we use so that we know if we are in
+    // the unrestricted motion border.
+    for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
+      for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
+        if (r < max_blocks_high && c < max_blocks_wide)
+          visit(plane, i, bsize, txfrm_size_b, arg);
+        i += step;
       }
-#endif
     }
-#endif
+  } else {
+    for (i = 0; i < (1 << ss_block_size); i += step) {
+      visit(plane, i, bsize, txfrm_size_b, arg);
+    }
   }
-  return tx_type;
 }
 
-void vp9_build_block_doffsets(MACROBLOCKD *xd);
-void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+static INLINE void foreach_transformed_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  int plane;
 
-static void update_blockd_bmi(MACROBLOCKD *xd) {
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+                                       visit, arg);
+  }
+}
 
-  if (mode == SPLITMV || mode == I8X8_PRED || mode == B_PRED) {
-    int i;
-    for (i = 0; i < 16; i++)
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+static INLINE void foreach_transformed_block_uv(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+                                       visit, arg);
   }
 }
 
-static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
-  TX_SIZE tx_size_uv;
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    if (xd->mode_info_context->mbmi.txfm_size == TX_32X32)
-      tx_size_uv = TX_16X16;
-    else
-      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+typedef void (*foreach_predicted_block_visitor)(int plane, int block,
+                                                BLOCK_SIZE_TYPE bsize,
+                                                int pred_w, int pred_h,
+                                                void *arg);
+static INLINE void foreach_predicted_block_in_plane(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int i, x, y;
+
+  // block sizes in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // subsampled size of the block
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+  // size of the predictor to use.
+  int pred_w, pred_h;
+
+  if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    pred_w = 0;
+    pred_h = 0;
   } else {
-    if (xd->mode_info_context->mbmi.txfm_size == TX_16X16)
-      tx_size_uv = TX_8X8;
-    else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-             (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-              xd->mode_info_context->mbmi.mode == SPLITMV))
-      tx_size_uv = TX_4X4;
-    else
-      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+    pred_w = bwl;
+    pred_h = bhl;
   }
-  return tx_size_uv;
+  assert(pred_w <= bwl);
+  assert(pred_h <= bhl);
+
+  // visit each subblock in raster order
+  i = 0;
+  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
+    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
+      visit(plane, i, bsize, pred_w, pred_h, arg);
+      i += 1 << pred_w;
+    }
+    i += (1 << (bwl + pred_h)) - (1 << bwl);
+  }
 }
+static INLINE void foreach_predicted_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
+  }
+}
+static INLINE void foreach_predicted_block_uv(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
+  }
+}
+static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                               int plane, int block, int stride) {
+  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
+                                         BLOCK_SIZE_TYPE bsize,
+                                         int plane, int block, int16_t *base) {
+  const int stride = plane_block_width(bsize, &xd->plane[plane]);
+  return base + raster_block_offset(xd, bsize, plane, block, stride);
+}
+static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
+                                         BLOCK_SIZE_TYPE bsize,
+                                         int plane, int block,
+                                         uint8_t *base, int stride) {
+  return base + raster_block_offset(xd, bsize, plane, block, stride);
+}
+
+static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
+                                       BLOCK_SIZE_TYPE bsize,
+                                       int plane, int block,
+                                       int ss_txfrm_size) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;
+  const int tx_cols_lg2 = bwl - txwl;
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;
+  const int x = (raster_mb & (tx_cols - 1)) << (txwl);
+  const int y = raster_mb >> tx_cols_lg2 << (txwl);
+  return x + (y << bwl);
+}
+
+static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
+                                     BLOCK_SIZE_TYPE bsize,
+                                     int plane, int block,
+                                     int ss_txfrm_size,
+                                     int *x, int *y) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;
+  const int tx_cols_lg2 = bwl - txwl;
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;
+  *x = (raster_mb & (tx_cols - 1)) << (txwl);
+  *y = raster_mb >> tx_cols_lg2 << (txwl);
+}
+
+static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
+                             BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
+  const int bw = plane_block_width(bsize, &xd->plane[plane]);
+  const int bh = plane_block_height(bsize, &xd->plane[plane]);
+  int x, y;
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
+  x = x * 4 - 1;
+  y = y * 4 - 1;
+  // Copy a pixel into the umv if we are in a situation where the block size
+  // extends into the UMV.
+  // TODO(JBB): Should be able to do the full extend in place so we don't have
+  // to do this multiple times.
+  if (xd->mb_to_right_edge < 0) {
+    int umv_border_start = bw
+        + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x));
+
+    if (x + bw > umv_border_start)
+      vpx_memset(
+          xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+              + umv_border_start,
+          *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+              + umv_border_start - 1),
+          bw);
+  }
+  if (xd->mb_to_bottom_edge < 0) {
+    int umv_border_start = bh
+        + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y));
+    int i;
+    uint8_t c = *(xd->plane[plane].dst.buf
+        + (umv_border_start - 1) * xd->plane[plane].dst.stride + x);
+
+    uint8_t *d = xd->plane[plane].dst.buf
+        + umv_border_start * xd->plane[plane].dst.stride + x;
+
+    if (y + bh > umv_border_start)
+      for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride)
+        *d = c;
+  }
+}
+static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                                   int plane, int ss_tx_size, int eob, int aoff,
+                                   int loff, ENTROPY_CONTEXT *A,
+                                   ENTROPY_CONTEXT *L) {
+  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+  const int sw = bw - xd->plane[plane].subsampling_x;
+  const int sh = bh - xd->plane[plane].subsampling_y;
+  int mi_blocks_wide = 1 << sw;
+  int mi_blocks_high = 1 << sh;
+  int tx_size_in_blocks = (1 << ss_tx_size);
+  int above_contexts = tx_size_in_blocks;
+  int left_contexts = tx_size_in_blocks;
+  int pt;
+
+  // xd->mb_to_right_edge is in units of pixels * 8.  This converts
+  // it to 4x4 block sizes.
+  if (xd->mb_to_right_edge < 0) {
+    mi_blocks_wide += (xd->mb_to_right_edge
+        >> (5 + xd->plane[plane].subsampling_x));
+  }
+
+  // this code attempts to avoid copying into contexts that are outside
+  // our border.  Any blocks that do are set to 0...
+  if (above_contexts + aoff > mi_blocks_wide)
+    above_contexts = mi_blocks_wide - aoff;
+
+  if (xd->mb_to_bottom_edge < 0) {
+    mi_blocks_high += (xd->mb_to_bottom_edge
+        >> (5 + xd->plane[plane].subsampling_y));
+  }
+  if (left_contexts + loff > mi_blocks_high) {
+    left_contexts = mi_blocks_high - loff;
+  }
+
+  for (pt = 0; pt < above_contexts; pt++)
+    A[pt] = eob > 0;
+  for (pt = above_contexts; pt < (1 << ss_tx_size); pt++)
+    A[pt] = 0;
+  for (pt = 0; pt < left_contexts; pt++)
+    L[pt] = eob > 0;
+  for (pt = left_contexts; pt < (1 << ss_tx_size); pt++)
+    L[pt] = 0;
+}
+
+
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -14,20 +14,8 @@
 /* Update probabilities for the nodes in the token entropy tree.
    Generated file included by vp9_entropy.c */
 
-static const vp9_prob vp9_coef_update_prob[ENTROPY_NODES] = {
-  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252
+static const vp9_prob vp9_coef_update_prob[UNCONSTRAINED_NODES] = {
+  252, 252, 252,
 };
-
-#if CONFIG_CODE_NONZEROCOUNT
-#define NZC_UPDATE_PROB_4X4     252
-#define NZC_UPDATE_PROB_8X8     252
-#define NZC_UPDATE_PROB_16X16   252
-#define NZC_UPDATE_PROB_32X32   252
-#define NZC_UPDATE_PROB_PCAT    252
-#endif
-
-#if CONFIG_MODELCOEFPROB
-#define COEF_MODEL_UPDATE_PROB   16
-#endif
 
 #endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H__
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -19,9 +19,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
-#define TRUE    1
-#define FALSE   0
-
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 
@@ -54,5 +51,18 @@
 static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
 }
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int multiple8(int value) {
+  return (value + 7) & ~7;
+}
+
+#define SYNC_CODE_0 0x49
+#define SYNC_CODE_1 0x83
+#define SYNC_CODE_2 0x42
+
 
 #endif  // VP9_COMMON_VP9_COMMON_H_
--- a/vp9/common/vp9_context.c
+++ /dev/null
@@ -1,397 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_entropy.h"
-
-/* *** GENERATED FILE: DO NOT EDIT *** */
-
-#if 0
-int Contexts[vp8_coef_counter_dimen];
-
-const int default_contexts[vp8_coef_counter_dimen] = {
-  {
-    // Block Type ( 0 )
-    {
-      // Coeff Band ( 0 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-    {
-      // Coeff Band ( 1 )
-      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
-      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
-      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
-    },
-    {
-      // Coeff Band ( 2 )
-      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
-      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
-      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
-    },
-    {
-      // Coeff Band ( 3 )
-      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
-      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
-      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
-    },
-    {
-      // Coeff Band ( 4 )
-      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
-      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
-    },
-    {
-      // Coeff Band ( 5 )
-      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
-      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
-    },
-    {
-      // Coeff Band ( 6 )
-      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
-      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
-      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
-    },
-  },
-  {
-    // Block Type ( 1 )
-    {
-      // Coeff Band ( 0 )
-      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
-      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
-      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
-    },
-    {
-      // Coeff Band ( 1 )
-      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
-      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
-      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
-    },
-    {
-      // Coeff Band ( 2 )
-      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
-      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
-      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
-    },
-    {
-      // Coeff Band ( 3 )
-      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
-      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
-      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
-    },
-    {
-      // Coeff Band ( 4 )
-      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
-      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
-      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
-    },
-    {
-      // Coeff Band ( 5 )
-      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
-      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
-      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
-    },
-    {
-      // Coeff Band ( 6 )
-      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
-      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
-      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
-      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-  },
-  {
-    // Block Type ( 2 )
-    {
-      // Coeff Band ( 0 )
-      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
-      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
-      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
-    },
-    {
-      // Coeff Band ( 1 )
-      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
-      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
-      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
-    },
-    {
-      // Coeff Band ( 2 )
-      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
-      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
-      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
-    },
-    {
-      // Coeff Band ( 3 )
-      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
-      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
-    },
-    {
-      // Coeff Band ( 4 )
-      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
-      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
-    },
-    {
-      // Coeff Band ( 5 )
-      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
-      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-    {
-      // Coeff Band ( 6 )
-      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
-      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-  },
-  {
-    // Block Type ( 3 )
-    {
-      // Coeff Band ( 0 )
-      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
-      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
-      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
-    },
-    {
-      // Coeff Band ( 1 )
-      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
-      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
-      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
-    },
-    {
-      // Coeff Band ( 2 )
-      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
-      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
-      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
-    },
-    {
-      // Coeff Band ( 3 )
-      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
-      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
-      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
-    },
-    {
-      // Coeff Band ( 4 )
-      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
-      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
-      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
-    },
-    {
-      // Coeff Band ( 5 )
-      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
-      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
-      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
-    },
-    {
-      // Coeff Band ( 6 )
-      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
-      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
-      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
-      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
-    },
-  },
-};
-
-// Update probabilities for the nodes in the token entropy tree.
-const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
-  {
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
-      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-};
-#endif
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -122,78 +122,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
-static inline uint8_t combine_qtr(uint8_t a, uint8_t b) {
-  return (((a) + (b) * 3 + 2) >> 2);
-}
-
-static inline uint8_t combine_3qtr(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) + 2) >> 2);
-}
-
-static inline uint8_t combine_1by8(uint8_t a, uint8_t b) {
-  return (((a) * 1 + (b) * 7 + 4) >> 3);
-}
-
-static inline uint8_t combine_3by8(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) * 5 + 4) >> 3);
-}
-
-static inline uint8_t combine_5by8(uint8_t a, uint8_t b) {
-  return (((a) * 5 + (b) * 3 + 4) >> 3);
-}
-
-static inline uint8_t combine_7by8(uint8_t a, uint8_t b) {
-  return (((a) * 7 + (b) * 1 + 4) >> 3);
-}
-
-// TODO(debargha): Implment with a separate weight parameter
-static void convolve_wtd_horiz_c(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int16_t *filter_x0, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4,
-                                 int w, int h, int taps,
-                                 uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-  const int16_t *filter_x_base = filter_x0;
-
-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source line */
-  src -= taps / 2 - 1;
-
-  for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
-    /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
-
-    for (x = 0; x < w; ++x) {
-      /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[src_x + k] * filter_x[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[x] = combine(dst[x], clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-#endif
-
 static void convolve_vert_c(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *filter_x, int x_step_q4,
@@ -279,52 +207,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void convolve_wtd_vert_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y0, int y_step_q4,
-                                int w, int h, int taps,
-                                uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-
-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source column */
-  src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;
-
-    /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
-
-    for (y = 0; y < h; ++y) {
-      /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[(src_y + k) * src_stride] * filter_y[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[y * dst_stride] = combine(dst[y * dst_stride],
-                                    clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
-    }
-    ++src;
-    ++dst;
-  }
-}
-#endif
-
 static void convolve_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int x_step_q4,
@@ -331,14 +213,14 @@
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h, int taps) {
   /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 39, for y_step_q4 == 32,
-   * h == 16, taps == 8.
+   * Maximum intermediate_height is 135, for y_step_q4 == 32,
+   * h == 64, taps == 8.
    */
-  uint8_t temp[16 * 39];
+  uint8_t temp[64 * 135];
   int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
 
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(w <= 64);
+  assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
 
@@ -346,10 +228,10 @@
     intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 16,
+                   temp, 64,
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, intermediate_height, taps);
-  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, h, taps);
 }
@@ -360,14 +242,14 @@
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int taps) {
   /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 39, for y_step_q4 == 32,
-   * h == 16, taps == 8.
+   * Maximum intermediate_height is 135, for y_step_q4 == 32,
+   * h == 64, taps == 8.
    */
-  uint8_t temp[16 * 39];
+  uint8_t temp[64 * 135];
   int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
 
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(w <= 64);
+  assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
 
@@ -375,10 +257,10 @@
     intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 16,
+                   temp, 64,
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, intermediate_height, taps);
-  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+  convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h, taps);
 }
@@ -403,68 +285,6 @@
                        w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_horiz_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -485,68 +305,6 @@
                       w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_vert_c(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_c(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      const int16_t *filter_x, int x_step_q4,
@@ -563,16 +321,16 @@
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+  assert(w <= 64);
+  assert(h <= 64);
 
   vp9_convolve8(src, src_stride,
-                temp, 16,
+                temp, 64,
                 filter_x, x_step_q4,
                 filter_y, y_step_q4,
                 w, h);
-  vp9_convolve_avg(temp, 16,
+  vp9_convolve_avg(temp, 64,
                    dst, dst_stride,
                    NULL, 0, /* These unused parameter should be removed! */
                    NULL, 0, /* These unused parameter should be removed! */
@@ -579,140 +337,6 @@
                    w, h);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_1by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_qtr_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_qtr(temp, 16,
-                   dst, dst_stride,
-                   NULL, 0, /* These unused parameter should be removed! */
-                   NULL, 0, /* These unused parameter should be removed! */
-                   w, h);
-}
-
-void vp9_convolve8_3by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_5by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_5by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_3qtr_c(const uint8_t *src, int src_stride,
-                          uint8_t *dst, int dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3qtr(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_7by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_7by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-#endif
-
 void vp9_convolve_copy(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int filter_x_stride,
@@ -750,101 +374,3 @@
     dst += dst_stride;
   }
 }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_1by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int filter_x_stride,
-                      const int16_t *filter_y, int filter_y_stride,
-                      int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_5by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_7by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -33,50 +33,6 @@
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h);
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-// Not a convolution, a block wtd (1/8, 7/8) average for (dst, src)
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (1/4, 3/4) average for (dst, src)
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int x_step_q4,
-                      const int16_t *filter_y, int y_step_q4,
-                      int w, int h);
-
-// Not a convolution, a block wtd (3/8, 5/8) average for (dst, src)
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (5/8, 3/8) average for (dst, src)
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (3/4, 1/4) average for (dst, src)
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (7/8, 1/8) average for (dst, src)
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-#endif
-
 struct subpix_fn_table {
   const int16_t (*filter_x)[8];
   const int16_t (*filter_y)[8];
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -13,130 +13,124 @@
 #include "vp9/common/vp9_blockd.h"
 
 void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame) {
-  int mb_row;
-  int mb_col;
-  int mb_index = 0;
-  FILE *mvs = fopen("mvs.stt", "a");
+                                        int frame, char *file) {
+  int mi_row;
+  int mi_col;
+  int mi_index = 0;
+  FILE *mvs = fopen(file, "a");
 
   // Print out the macroblock Y modes
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+  fprintf(mvs, "SB Types for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+      mi_index++;
+    }
 
-      mb_index++;
+    fprintf(mvs, "\n");
+    mi_index += 8;
+  }
+
+  // Print out the macroblock Y modes
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+  mi_index = 0;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode);
+
+      mi_index++;
     }
 
     fprintf(mvs, "\n");
-    mb_index++;
+    mi_index += 8;
   }
 
   fprintf(mvs, "\n");
 
-  mb_index = 0;
+  mi_index = 0;
   fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
-      mb_index++;
+      mi_index++;
     }
 
     fprintf(mvs, "\n");
-    mb_index++;
+    mi_index += 8;
   }
-
   fprintf(mvs, "\n");
 
-  /* print out the macroblock UV modes */
-  mb_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+  mi_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
+              mi[mi_index].mbmi.mv[0].as_mv.col);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
-      mb_index++;
+      mi_index++;
     }
 
-    mb_index++;
     fprintf(mvs, "\n");
+    mi_index += 8;
   }
 
   fprintf(mvs, "\n");
 
-  /* print out the block modes */
-  mb_index = 0;
-  fprintf(mvs, "Mbs for Frame %d\n", frame);
-  {
-    int b_row;
+  /* print out the macroblock txform sizes */
+  mi_index = 0;
+  fprintf(mvs, "TXFM size for Frame %d\n", frame);
 
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size);
 
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-
-        if (mi[mb_index].mbmi.mode == B_PRED) {
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-        } else
-          fprintf(mvs, "xx ");
-
-      }
-
-      fprintf(mvs, "\n");
+      mi_index++;
     }
+
+    mi_index += 8;
+    fprintf(mvs, "\n");
   }
+
   fprintf(mvs, "\n");
 
-  /* print out the macroblock mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
+  /* print out the macroblock UV modes */
+  mi_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
-          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode);
 
-      mb_index++;
+      mi_index++;
     }
 
-    mb_index++;
+    mi_index += 8;
     fprintf(mvs, "\n");
   }
 
   fprintf(mvs, "\n");
 
-  /* print out the block modes */
-  mb_index = 0;
+  /* print out the macroblock mvs */
+  mi_index = 0;
   fprintf(mvs, "MVs for Frame %d\n", frame);
-  {
-    int b_row;
 
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2,
+              mi[mi_index].mbmi.mv[0].as_mv.col / 2);
 
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-        fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);
-
-      }
-
-      fprintf(mvs, "\n");
+      mi_index++;
     }
+
+    mi_index += 8;
+    fprintf(mvs, "\n");
   }
+
   fprintf(mvs, "\n");
 
   fclose(mvs);
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -11,987 +11,1374 @@
 
 /*Generated file, included by vp9_entropy.c*/
 
-// NOTE: When the CONFIG_MODELCOEFPROB experiment is on, only the first
-// 2 or 3 from each row is actually used depending on whether
-// UNCONSTRAINDED_NODES is 2 or 3. If this experiment is merged
-// the tables below should be shortened accordingly.
-static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES] = {
+#if CONFIG_BALANCED_COEFTREE
+static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 208,  32, 178, 198, 161, 167, 196, 147, 244, 194, 210 },
-        { 102,  43, 132, 185, 148, 162, 185, 141, 237, 181, 215 },
-        {  15,  36,  68, 143, 119, 151, 169, 133, 230, 173, 214 }
+        {   6, 213, 178 },
+        {  26, 113, 132 },
+        {  34,  17,  68 }
       }, { /* Coeff Band 1 */
-        {  71,  91, 178, 226, 169, 176, 232, 170, 252, 219, 231 },
-        {  72,  88, 174, 226, 168, 176, 232, 170, 252, 219, 234 },
-        {  40,  79, 154, 222, 161, 174, 231, 169, 251, 219, 238 },
-        {  21,  68, 126, 211, 144, 167, 230, 167, 252, 219, 236 },
-        {   7,  49,  84, 175, 121, 152, 223, 151, 251, 218, 237 },
-        {   1,  20,  32, 100,  97, 140, 163, 116, 237, 186, 222 }
+        {  66,  96, 178 },
+        {  63,  96, 174 },
+        {  67,  54, 154 },
+        {  62,  28, 126 },
+        {  48,   9,  84 },
+        {  20,   1,  32 }
       }, { /* Coeff Band 2 */
-        { 108, 110, 206, 237, 182, 183, 239, 181, 252, 221, 245 },
-        {  72,  98, 191, 236, 180, 182, 240, 183, 252, 223, 239 },
-        {  26,  77, 152, 230, 166, 179, 239, 181, 252, 222, 241 },
-        {   7,  57, 106, 212, 141, 167, 236, 173, 252, 223, 243 },
-        {   1,  35,  60, 171, 110, 149, 225, 155, 251, 218, 240 },
-        {   1,  14,  22,  90,  86, 134, 163, 116, 238, 181, 233 }
+        {  64, 144, 206 },
+        {  70,  99, 191 },
+        {  69,  36, 152 },
+        {  55,   9, 106 },
+        {  35,   1,  60 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 3 */
-        { 105, 139, 222, 245, 196, 192, 245, 195, 253, 229, 255 },
-        {  76, 118, 205, 245, 192, 192, 247, 198, 254, 230, 255 },
-        {  21,  88, 164, 240, 175, 186, 246, 197, 255, 232, 255 },
-        {   5,  63, 118, 222, 149, 172, 242, 185, 255, 230, 254 },
-        {   1,  42,  74, 186, 120, 157, 227, 161, 253, 220, 250 },
-        {   1,  18,  30,  97,  92, 136, 163, 118, 244, 184, 244 }
+        {  82, 154, 222 },
+        {  83, 112, 205 },
+        {  81,  31, 164 },
+        {  62,   7, 118 },
+        {  42,   1,  74 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 4 */
-        { 143, 117, 233, 251, 207, 201, 250, 210, 255, 239, 128 },
-        {  99, 104, 214, 249, 200, 199, 251, 211, 255, 238, 255 },
-        {  26,  81, 170, 245, 183, 192, 250, 206, 255, 242, 255 },
-        {   6,  60, 116, 226, 151, 176, 242, 187, 255, 235, 255 },
-        {   1,  38,  65, 178, 114, 153, 224, 157, 254, 224, 255 },
-        {   1,  15,  26,  86,  88, 133, 163, 110, 251, 197, 252 }
+        {  52, 179, 233 },
+        {  64, 132, 214 },
+        {  73,  36, 170 },
+        {  59,   8, 116 },
+        {  38,   1,  65 },
+        {  15,   1,  26 }
       }, { /* Coeff Band 5 */
-        { 155,  74, 238, 252, 215, 206, 252, 223, 255, 255, 128 },
-        { 152,  64, 223, 250, 205, 201, 254, 219, 255, 255, 128 },
-        {  67,  55, 182, 246, 187, 192, 251, 210, 255, 240, 128 },
-        {  27,  44, 127, 227, 155, 176, 244, 186, 255, 240, 255 },
-        {   9,  27,  69, 176, 115, 152, 227, 154, 255, 229, 255 },
-        {   2,  11,  28,  91,  84, 133, 177, 115, 254, 210, 255 }
+        {  29, 175, 238 },
+        {  26, 169, 223 },
+        {  41,  80, 182 },
+        {  39,  32, 127 },
+        {  26,  10,  69 },
+        {  11,   2,  28 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 207, 112, 234, 244, 192, 193, 246, 194, 255, 237, 255 },
-        { 145, 120, 212, 233, 178, 183, 232, 177, 252, 216, 228 },
-        {  77, 114, 177, 214, 164, 174, 210, 159, 245, 199, 230 }
+        {  21, 226, 234 },
+        {  52, 182, 212 },
+        {  80, 112, 177 }
       }, { /* Coeff Band 1 */
-        {  93, 174, 243, 248, 205, 200, 245, 195, 255, 232, 255 },
-        { 100, 144, 231, 248, 204, 200, 244, 193, 255, 232, 255 },
-        {  28, 101, 186, 247, 194, 199, 244, 194, 255, 232, 255 },
-        {   9,  73, 132, 238, 155, 186, 245, 197, 255, 232, 250 },
-        {   2,  44,  76, 187, 112, 151, 240, 172, 255, 235, 249 },
-        {   1,  19,  33,  98,  92, 138, 176, 113, 252, 208, 249 }
+        { 111, 164, 243 },
+        {  88, 152, 231 },
+        {  90,  43, 186 },
+        {  70,  12, 132 },
+        {  44,   2,  76 },
+        {  19,   1,  33 }
       }, { /* Coeff Band 2 */
-        { 116, 175, 246, 250, 212, 202, 248, 198, 255, 238, 255 },
-        {  78, 142, 231, 250, 208, 203, 249, 200, 255, 241, 255 },
-        {  14,  93, 177, 245, 186, 196, 248, 198, 255, 241, 255 },
-        {   4,  65, 122, 227, 148, 177, 244, 186, 255, 241, 243 },
-        {   1,  38,  69, 180, 111, 152, 235, 162, 255, 237, 247 },
-        {   1,  18,  30, 101,  89, 133, 190, 116, 255, 219, 246 }
+        {  96, 185, 246 },
+        {  99, 127, 231 },
+        {  88,  21, 177 },
+        {  64,   5, 122 },
+        {  38,   1,  69 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 3 */
-        { 138, 183, 249, 253, 220, 209, 252, 210, 255, 251, 128 },
-        {  93, 147, 237, 252, 213, 209, 253, 213, 255, 251, 128 },
-        {  21, 104, 187, 247, 185, 196, 252, 210, 255, 249, 128 },
-        {   6,  73, 131, 225, 147, 174, 248, 190, 255, 248, 128 },
-        {   1,  47,  83, 189, 119, 155, 239, 167, 255, 246, 128 },
-        {   1,  26,  44, 130,  96, 139, 209, 129, 255, 235, 255 }
+        {  84, 206, 249 },
+        {  94, 147, 237 },
+        {  95,  33, 187 },
+        {  71,   8, 131 },
+        {  47,   1,  83 },
+        {  26,   1,  44 }
       }, { /* Coeff Band 4 */
-        { 188, 143, 252, 255, 228, 218, 253, 218, 255, 209, 128 },
-        { 137, 124, 241, 253, 215, 211, 254, 221, 255, 255, 128 },
-        {  32,  89, 188, 248, 186, 198, 254, 216, 255, 253, 128 },
-        {   7,  61, 122, 231, 146, 176, 252, 201, 255, 250, 128 },
-        {   1,  34,  66, 186, 103, 149, 246, 176, 255, 249, 128 },
-        {   1,  18,  34, 115,  91, 134, 217, 124, 255, 233, 255 }
+        {  38, 221, 252 },
+        {  58, 177, 241 },
+        {  78,  46, 188 },
+        {  59,   9, 122 },
+        {  34,   1,  66 },
+        {  18,   1,  34 }
       }, { /* Coeff Band 5 */
-        { 198,  92, 253, 255, 231, 222, 255, 230, 128, 128, 128 },
-        { 189,  79, 244, 254, 220, 217, 255, 237, 255, 255, 128 },
-        {  78,  61, 200, 252, 196, 207, 255, 231, 255, 255, 128 },
-        {  34,  50, 146, 242, 161, 187, 255, 222, 255, 255, 128 },
-        {  11,  38,  93, 215, 122, 159, 253, 202, 255, 255, 128 },
-        {   1,  31,  55, 143, 102, 143, 227, 148, 255, 238, 128 }
+        {  21, 216, 253 },
+        {  21, 206, 244 },
+        {  42,  93, 200 },
+        {  43,  41, 146 },
+        {  36,  13,  93 },
+        {  31,   1,  55 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 207,  35, 219, 243, 195, 192, 243, 188, 251, 232, 238 },
-        { 126,  46, 182, 230, 177, 182, 228, 171, 248, 214, 232 },
-        {  51,  47, 125, 196, 147, 166, 206, 151, 245, 199, 229 }
+        {   7, 213, 219 },
+        {  23, 139, 182 },
+        {  38,  60, 125 }
       }, { /* Coeff Band 1 */
-        { 114, 124, 220, 244, 197, 192, 242, 189, 253, 226, 255 },
-        { 142, 116, 213, 243, 194, 191, 241, 188, 252, 226, 255 },
-        {  81, 101, 190, 242, 188, 190, 242, 190, 253, 229, 255 },
-        {  42,  83, 155, 235, 166, 183, 241, 190, 253, 227, 246 },
-        {  16,  62, 104, 205, 133, 161, 238, 176, 254, 227, 250 },
-        {   6,  40,  60, 132, 109, 145, 190, 128, 248, 202, 239 }
+        {  69, 156, 220 },
+        {  52, 178, 213 },
+        {  69, 111, 190 },
+        {  69,  58, 155 },
+        {  58,  21, 104 },
+        {  39,   7,  60 }
       }, { /* Coeff Band 2 */
-        { 139, 149, 228, 248, 205, 198, 244, 196, 255, 223, 255 },
-        { 115, 127, 221, 248, 202, 198, 245, 198, 255, 228, 255 },
-        {  43, 100, 189, 246, 195, 195, 244, 196, 254, 234, 228 },
-        {  13,  77, 141, 238, 168, 187, 243, 191, 255, 232, 255 },
-        {   3,  49,  88, 203, 125, 160, 237, 178, 253, 227, 251 },
-        {   1,  23,  41, 118,  97, 136, 191, 127, 250, 207, 247 }
+        {  68, 189, 228 },
+        {  70, 158, 221 },
+        {  83,  64, 189 },
+        {  73,  18, 141 },
+        {  48,   4,  88 },
+        {  23,   1,  41 }
       }, { /* Coeff Band 3 */
-        { 119, 185, 236, 251, 216, 205, 249, 202, 253, 237, 255 },
-        {  89, 140, 224, 251, 211, 205, 250, 208, 255, 241, 255 },
-        {  34, 105, 189, 248, 195, 197, 250, 208, 255, 245, 255 },
-        {  14,  78, 142, 235, 166, 182, 246, 194, 255, 242, 255 },
-        {   5,  49,  90, 196, 128, 160, 235, 165, 255, 237, 255 },
-        {   1,  22,  41, 114,  97, 139, 180, 124, 252, 201, 249 }
+        {  99, 194, 236 },
+        {  91, 138, 224 },
+        {  91,  53, 189 },
+        {  74,  20, 142 },
+        {  48,   6,  90 },
+        {  22,   1,  41 }
       }, { /* Coeff Band 4 */
-        { 162, 142, 244, 254, 228, 215, 255, 230, 128, 128, 128 },
-        { 129, 120, 231, 253, 216, 210, 255, 228, 255, 255, 128 },
-        {  44,  90, 189, 249, 195, 199, 253, 217, 255, 240, 128 },
-        {  14,  65, 132, 234, 158, 181, 249, 203, 255, 248, 128 },
-        {   3,  38,  72, 188, 112, 154, 239, 171, 255, 243, 128 },
-        {   1,  17,  39, 110,  86, 141, 201, 123, 255, 240, 128 }
+        {  52, 203, 244 },
+        {  60, 168, 231 },
+        {  75,  62, 189 },
+        {  61,  18, 132 },
+        {  38,   4,  72 },
+        {  17,   1,  39 }
       }, { /* Coeff Band 5 */
-        { 167,  96, 247, 255, 230, 218, 249, 231, 255, 255, 128 },
-        { 163,  84, 234, 253, 214, 209, 255, 231, 255, 255, 128 },
-        {  70,  63, 185, 249, 189, 197, 255, 230, 255, 255, 128 },
-        {  30,  44, 132, 238, 157, 180, 251, 210, 255, 220, 128 },
-        {  13,  30,  80, 195, 121, 153, 243, 179, 255, 224, 128 },
-        {   5,  13,  38, 103, 109, 128, 196, 147, 255, 255, 128 }
+        {  33, 192, 247 },
+        {  31, 185, 234 },
+        {  46,  85, 185 },
+        {  39,  35, 132 },
+        {  28,  15,  80 },
+        {  13,   5,  38 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 242,  90, 246, 244, 200, 192, 242, 189, 255, 234, 255 },
-        { 186, 102, 228, 233, 187, 182, 231, 172, 254, 225, 252 },
-        { 102, 108, 203, 228, 181, 180, 218, 167, 243, 201, 223 }
+        {   5, 247, 246 },
+        {  28, 209, 228 },
+        {  65, 137, 203 }
       }, { /* Coeff Band 1 */
-        { 152, 169, 250, 253, 223, 209, 251, 208, 255, 250, 128 },
-        { 164, 149, 242, 253, 222, 209, 249, 207, 253, 238, 255 },
-        {  63, 108, 204, 252, 215, 211, 251, 211, 255, 242, 128 },
-        {  39,  83, 153, 248, 175, 199, 250, 214, 255, 245, 128 },
-        {  31,  66, 108, 214, 130, 161, 251, 196, 255, 237, 128 },
-        {  27,  65,  71, 150, 112, 149, 213, 133, 255, 230, 255 }
+        {  69, 208, 250 },
+        {  54, 207, 242 },
+        {  81,  92, 204 },
+        {  70,  54, 153 },
+        {  58,  40, 108 },
+        {  58,  35,  71 }
       }, { /* Coeff Band 2 */
-        { 161, 174, 250, 254, 226, 215, 254, 226, 255, 230, 128 },
-        { 133, 150, 239, 254, 222, 213, 254, 225, 255, 255, 128 },
-        {  32, 105, 197, 252, 206, 207, 253, 220, 255, 255, 128 },
-        {  10,  78, 147, 245, 173, 193, 253, 212, 255, 255, 128 },
-        {   2,  49,  99, 221, 133, 164, 250, 198, 255, 252, 128 },
-        {   1,  26,  53, 154,  96, 135, 234, 142, 255, 240, 128 }
+        {  65, 215, 250 },
+        {  72, 185, 239 },
+        {  92,  50, 197 },
+        {  75,  14, 147 },
+        {  49,   2,  99 },
+        {  26,   1,  53 }
       }, { /* Coeff Band 3 */
-        { 160, 187, 251, 255, 234, 223, 255, 233, 128, 128, 128 },
-        { 131, 155, 241, 255, 228, 222, 255, 232, 255, 255, 128 },
-        {  42, 108, 198, 253, 207, 212, 255, 234, 255, 255, 128 },
-        {  18,  81, 151, 246, 176, 194, 254, 222, 255, 255, 128 },
-        {   9,  60, 112, 225, 144, 167, 252, 199, 255, 255, 128 },
-        {   5,  35,  49, 163, 113, 150, 237, 118, 255, 255, 128 }
+        {  70, 220, 251 },
+        {  76, 186, 241 },
+        {  90,  65, 198 },
+        {  75,  26, 151 },
+        {  58,  12, 112 },
+        {  34,   6,  49 }
       }, { /* Coeff Band 4 */
-        { 195, 141, 253, 255, 242, 232, 255, 255, 128, 128, 128 },
-        { 169, 128, 245, 255, 235, 227, 255, 248, 128, 128, 128 },
-        {  62,  91, 204, 255, 216, 220, 255, 233, 128, 128, 128 },
-        {  23,  70, 150, 248, 178, 202, 255, 223, 128, 128, 128 },
-        {   2,  44,  78, 220, 110, 164, 255, 209, 128, 128, 128 },
-        {   1,   1, 128, 255, 255, 128, 128, 128, 128, 128, 128 }
+        {  34, 224, 253 },
+        {  44, 204, 245 },
+        {  69,  85, 204 },
+        {  64,  31, 150 },
+        {  44,   2,  78 },
+        {   1,   1, 128 }
       }, { /* Coeff Band 5 */
-        { 195, 104, 253, 255, 246, 246, 255, 171, 128, 128, 128 },
-        { 197,  92, 248, 255, 239, 228, 255, 239, 128, 128, 128 },
-        {  88,  71, 214, 255, 219, 220, 255, 244, 128, 128, 128 },
-        {  39,  56, 160, 250, 187, 204, 255, 255, 128, 128, 128 },
-        {  18,  28,  90, 217,  81, 137, 255, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        {  25, 216, 253 },
+        {  21, 215, 248 },
+        {  47, 108, 214 },
+        {  47,  48, 160 },
+        {  26,  20,  90 },
+        {  64, 171, 128 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 196,  40, 199, 180, 158, 161, 172, 135, 226, 183, 140 },
-        {  83,  38, 128, 153, 142, 157, 155, 128, 222, 164, 202 },
-        {  10,  29,  55, 116, 113, 146, 150, 122, 223, 169, 200 }
+        {   9, 203, 199 },
+        {  26,  92, 128 },
+        {  28,  11,  55 }
       }, { /* Coeff Band 1 */
-        {  33, 114, 160, 211, 155, 169, 223, 162, 248, 212, 215 },
-        {  69, 107, 155, 210, 154, 169, 224, 163, 248, 212, 216 },
-        {  30,  91, 138, 207, 150, 168, 223, 162, 248, 212, 216 },
-        {  12,  74, 115, 200, 140, 164, 222, 160, 249, 212, 219 },
-        {   4,  52,  80, 172, 121, 153, 216, 149, 249, 212, 226 },
-        {   1,  27,  40, 105, 101, 141, 157, 120, 231, 177, 210 }
+        {  99,  54, 160 },
+        {  78,  99, 155 },
+        {  80,  44, 138 },
+        {  71,  17, 115 },
+        {  51,   5,  80 },
+        {  27,   1,  40 }
       }, { /* Coeff Band 2 */
-        {  38, 159, 190, 227, 171, 177, 229, 172, 250, 214, 237 },
-        {  34, 130, 182, 229, 173, 180, 231, 174, 249, 215, 234 },
-        {  10,  97, 153, 226, 164, 178, 232, 175, 250, 215, 241 },
-        {   3,  71, 115, 213, 145, 170, 230, 171, 251, 217, 235 },
-        {   1,  41,  68, 172, 114, 152, 219, 154, 250, 212, 235 },
-        {   1,  16,  27,  88,  90, 135, 155, 113, 235, 180, 216 }
+        { 135,  81, 190 },
+        { 113,  61, 182 },
+        {  93,  16, 153 },
+        {  70,   4, 115 },
+        {  41,   1,  68 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 3 */
-        {  41, 184, 214, 238, 187, 186, 235, 180, 252, 217, 236 },
-        {  24, 142, 199, 241, 188, 189, 237, 184, 252, 220, 235 },
-        {   6,  97, 159, 235, 172, 184, 239, 185, 252, 221, 243 },
-        {   1,  63, 110, 214, 144, 170, 234, 174, 253, 223, 243 },
-        {   1,  32,  58, 166, 109, 149, 218, 152, 251, 215, 238 },
-        {   1,  12,  21,  78,  85, 131, 152, 109, 236, 180, 224 }
+        { 155, 103, 214 },
+        { 129,  48, 199 },
+        {  95,  10, 159 },
+        {  63,   1, 110 },
+        {  32,   1,  58 },
+        {  12,   1,  21 }
       }, { /* Coeff Band 4 */
-        {  54, 207, 231, 245, 201, 193, 238, 186, 252, 221, 220 },
-        {  32, 156, 213, 246, 198, 195, 242, 192, 252, 224, 245 },
-        {   7,  98, 164, 240, 177, 187, 243, 193, 252, 227, 244 },
-        {   2,  62, 108, 216, 143, 170, 237, 177, 254, 227, 248 },
-        {   1,  32,  57, 165, 108, 148, 219, 152, 252, 217, 243 },
-        {   1,  13,  22,  79,  87, 132, 153, 109, 240, 182, 232 }
+        { 163, 149, 231 },
+        { 137,  69, 213 },
+        {  95,  11, 164 },
+        {  62,   3, 108 },
+        {  32,   1,  57 },
+        {  13,   1,  22 }
       }, { /* Coeff Band 5 */
-        {  89, 208, 239, 250, 216, 200, 240, 190, 255, 222, 219 },
-        {  53, 155, 223, 250, 209, 202, 245, 199, 253, 225, 246 },
-        {  12, 102, 170, 243, 183, 192, 246, 198, 254, 230, 255 },
-        {   3,  67, 111, 218, 144, 171, 239, 180, 254, 231, 248 },
-        {   1,  38,  60, 164, 108, 148, 221, 152, 253, 220, 246 },
-        {   1,  18,  26,  81,  88, 132, 157, 108, 245, 188, 241 }
+        { 136, 189, 239 },
+        { 123, 102, 223 },
+        {  97,  19, 170 },
+        {  66,   4, 111 },
+        {  38,   1,  60 },
+        {  18,   1,  26 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 205, 121, 244, 237, 187, 188, 229, 174, 248, 215, 228 },
-        { 140, 120, 211, 219, 174, 177, 207, 158, 241, 195, 214 },
-        {  51, 100, 152, 198, 155, 168, 199, 148, 240, 193, 207 }
+        {  24, 226, 244 },
+        {  54, 178, 211 },
+        {  80,  74, 152 }
       }, { /* Coeff Band 1 */
-        {  66, 196, 236, 247, 202, 197, 243, 193, 254, 228, 246 },
-        {  99, 164, 223, 246, 199, 196, 243, 193, 254, 226, 255 },
-        {  29, 122, 187, 244, 187, 194, 244, 193, 255, 227, 239 },
-        {  14,  95, 145, 234, 156, 181, 244, 194, 254, 229, 246 },
-        {   6,  68,  97, 190, 123, 155, 240, 168, 254, 232, 245 },
-        {   3,  43,  50, 112, 105, 143, 170, 118, 245, 195, 230 }
+        { 145, 153, 236 },
+        { 101, 163, 223 },
+        { 108,  50, 187 },
+        {  90,  22, 145 },
+        {  66,   8,  97 },
+        {  42,   4,  50 }
       }, { /* Coeff Band 2 */
-        {  66, 202, 238, 248, 206, 199, 245, 196, 254, 233, 244 },
-        {  45, 155, 218, 248, 200, 199, 245, 197, 254, 229, 208 },
-        {   6,  96, 163, 242, 178, 191, 245, 196, 254, 233, 228 },
-        {   2,  64, 110, 224, 142, 175, 242, 185, 254, 232, 247 },
-        {   1,  34,  61, 172, 103, 147, 232, 164, 254, 226, 244 },
-        {   1,  13,  24,  82,  85, 133, 165, 105, 248, 199, 242 }
+        { 150, 159, 238 },
+        { 128,  90, 218 },
+        {  94,   9, 163 },
+        {  64,   3, 110 },
+        {  34,   1,  61 },
+        {  13,   1,  24 }
       }, { /* Coeff Band 3 */
-        {  66, 204, 242, 251, 213, 204, 248, 204, 255, 236, 255 },
-        {  38, 158, 222, 251, 206, 205, 249, 206, 255, 238, 255 },
-        {   6,  95, 166, 244, 178, 194, 249, 205, 255, 236, 255 },
-        {   2,  61, 111, 223, 141, 173, 244, 187, 255, 237, 255 },
-        {   1,  31,  59, 171, 104, 149, 230, 158, 255, 230, 252 },
-        {   1,  12,  22,  82,  79, 128, 171, 111, 251, 203, 249 }
+        { 151, 162, 242 },
+        { 135,  80, 222 },
+        {  93,   9, 166 },
+        {  61,   3, 111 },
+        {  31,   1,  59 },
+        {  12,   1,  22 }
       }, { /* Coeff Band 4 */
-        {  63, 214, 245, 252, 219, 208, 249, 206, 255, 241, 128 },
-        {  38, 164, 228, 252, 210, 208, 251, 212, 255, 245, 255 },
-        {   5, 101, 174, 246, 182, 196, 251, 207, 255, 244, 255 },
-        {   1,  64, 116, 224, 142, 174, 246, 190, 255, 241, 228 },
-        {   1,  34,  63, 172, 105, 148, 233, 160, 255, 235, 237 },
-        {   1,  14,  26,  88,  85, 130, 177, 110, 252, 210, 250 }
+        { 161, 170, 245 },
+        { 140,  84, 228 },
+        {  99,   8, 174 },
+        {  64,   1, 116 },
+        {  34,   1,  63 },
+        {  14,   1,  26 }
       }, { /* Coeff Band 5 */
-        {  91, 214, 246, 254, 226, 213, 251, 210, 255, 239, 255 },
-        {  55, 162, 233, 253, 215, 210, 253, 216, 255, 244, 128 },
-        {  10, 104, 179, 247, 184, 196, 252, 212, 255, 247, 255 },
-        {   2,  67, 119, 226, 143, 173, 249, 195, 255, 245, 255 },
-        {   1,  37,  66, 175, 106, 149, 237, 164, 255, 240, 255 },
-        {   1,  16,  30,  96,  87, 132, 188, 113, 255, 222, 255 }
+        { 138, 197, 246 },
+        { 127, 109, 233 },
+        { 100,  16, 179 },
+        {  66,   3, 119 },
+        {  37,   1,  66 },
+        {  16,   1,  30 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 211,  32, 212, 235, 185, 184, 223, 167, 239, 210, 182 },
-        { 121,  47, 171, 224, 171, 180, 211, 162, 238, 195, 221 },
-        {  40,  51, 118, 203, 145, 168, 211, 160, 246, 200, 236 }
+        {   6, 216, 212 },
+        {  25, 134, 171 },
+        {  43,  48, 118 }
       }, { /* Coeff Band 1 */
-        {  71, 129, 209, 244, 192, 194, 242, 188, 255, 230, 255 },
-        { 118, 122, 206, 244, 192, 192, 241, 187, 254, 227, 255 },
-        {  53, 104, 184, 241, 186, 190, 241, 184, 254, 232, 255 },
-        {  20,  81, 148, 234, 168, 183, 240, 183, 254, 231, 240 },
-        {   3,  47,  82, 197, 127, 160, 234, 166, 254, 228, 251 },
-        {   1,  18,  28,  96,  88, 134, 174, 116, 247, 194, 247 }
+        {  93, 112, 209 },
+        {  66, 159, 206 },
+        {  82,  78, 184 },
+        {  75,  28, 148 },
+        {  46,   4,  82 },
+        {  18,   1,  28 }
       }, { /* Coeff Band 2 */
-        {  86, 162, 220, 247, 203, 198, 245, 193, 255, 237, 255 },
-        {  84, 134, 216, 247, 201, 197, 244, 192, 255, 233, 255 },
-        {  26, 102, 186, 243, 190, 192, 244, 192, 255, 232, 255 },
-        {   7,  75, 135, 231, 163, 181, 240, 183, 255, 234, 255 },
-        {   1,  46,  79, 193, 121, 157, 233, 168, 255, 225, 242 },
-        {   1,  20,  35, 113,  94, 136, 191, 123, 252, 209, 250 }
+        { 108, 148, 220 },
+        {  90, 130, 216 },
+        {  92,  40, 186 },
+        {  73,  10, 135 },
+        {  46,   1,  79 },
+        {  20,   1,  35 }
       }, { /* Coeff Band 3 */
-        {  89, 191, 232, 250, 211, 203, 248, 202, 255, 230, 128 },
-        {  67, 148, 223, 250, 207, 201, 250, 207, 255, 247, 255 },
-        {  19, 105, 183, 245, 189, 193, 249, 202, 255, 244, 255 },
-        {   5,  72, 127, 228, 156, 177, 245, 186, 255, 238, 255 },
-        {   1,  44,  76, 190, 119, 156, 234, 167, 255, 231, 255 },
-        {   1,  21,  36, 116,  92, 138, 195, 128, 250, 208, 241 }
+        { 125, 173, 232 },
+        { 109, 117, 223 },
+        {  97,  31, 183 },
+        {  71,   7, 127 },
+        {  44,   1,  76 },
+        {  21,   1,  36 }
       }, { /* Coeff Band 4 */
-        {  94, 210, 236, 252, 215, 206, 253, 209, 255, 247, 128 },
-        {  68, 153, 224, 251, 209, 204, 251, 213, 255, 240, 128 },
-        {  14, 103, 178, 246, 188, 195, 251, 209, 255, 239, 128 },
-        {   2,  70, 122, 230, 154, 177, 247, 194, 255, 239, 128 },
-        {   1,  42,  72, 189, 115, 153, 234, 166, 255, 229, 255 },
-        {   1,  19,  34, 104,  98, 143, 180, 124, 252, 200, 255 }
+        { 133, 195, 236 },
+        { 112, 121, 224 },
+        {  97,  23, 178 },
+        {  69,   3, 122 },
+        {  42,   1,  72 },
+        {  19,   1,  34 }
       }, { /* Coeff Band 5 */
-        {  87, 200, 238, 254, 226, 214, 250, 212, 255, 226, 128 },
-        {  55, 151, 225, 253, 217, 212, 253, 217, 255, 233, 128 },
-        {  11, 106, 179, 249, 193, 200, 252, 213, 255, 247, 128 },
-        {   2,  72, 124, 232, 155, 180, 246, 195, 255, 230, 128 },
-        {   1,  42,  70, 182, 114, 153, 232, 163, 255, 236, 255 },
-        {   1,  17,  28,  95,  92, 137, 170, 115, 252, 208, 228 }
+        { 132, 180, 238 },
+        { 119, 102, 225 },
+        { 101,  18, 179 },
+        {  71,   3, 124 },
+        {  42,   1,  70 },
+        {  17,   1,  28 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 238,  66, 250, 245, 205, 193, 232, 180, 254, 228, 255 },
-        { 178,  84, 226, 237, 192, 185, 230, 176, 253, 217, 251 },
-        {  76,  83, 168, 218, 166, 173, 225, 162, 252, 220, 243 }
+        {   5, 242, 250 },
+        {  26, 198, 226 },
+        {  58,  98, 168 }
       }, { /* Coeff Band 1 */
-        { 137, 176, 246, 252, 218, 207, 251, 208, 255, 238, 128 },
-        { 176, 160, 237, 252, 217, 206, 249, 209, 255, 247, 128 },
-        {  68, 128, 205, 251, 209, 207, 251, 207, 255, 248, 128 },
-        {  40, 105, 167, 246, 172, 192, 252, 215, 255, 247, 128 },
-        {  22,  84, 131, 214, 144, 164, 249, 185, 255, 250, 255 },
-        {  11,  60,  91, 161, 130, 155, 194, 133, 253, 214, 255 }
+        {  82, 201, 246 },
+        {  50, 219, 237 },
+        {  94, 107, 205 },
+        {  89,  61, 167 },
+        {  77,  31, 131 },
+        {  57,  14,  91 }
       }, { /* Coeff Band 2 */
-        { 124, 192, 247, 253, 223, 210, 254, 215, 255, 255, 128 },
-        { 103, 161, 234, 253, 218, 209, 253, 214, 255, 255, 128 },
-        {  19, 108, 190, 250, 202, 202, 251, 213, 255, 241, 128 },
-        {   6,  74, 131, 242, 165, 191, 251, 207, 255, 244, 128 },
-        {   1,  41,  72, 198, 111, 151, 249, 185, 255, 248, 128 },
-        {   1,  14,  24,  82,  90, 140, 185,  96, 254, 224, 255 }
+        {  99, 202, 247 },
+        {  96, 165, 234 },
+        { 100,  31, 190 },
+        {  72,   8, 131 },
+        {  41,   1,  72 },
+        {  14,   1,  24 }
       }, { /* Coeff Band 3 */
-        { 118, 200, 248, 254, 228, 216, 254, 222, 255, 213, 128 },
-        {  91, 166, 235, 254, 220, 212, 254, 223, 255, 233, 128 },
-        {  16, 110, 186, 251, 197, 201, 255, 225, 255, 255, 128 },
-        {   3,  72, 124, 239, 160, 186, 253, 209, 255, 239, 128 },
-        {   1,  39,  66, 198, 106, 151, 248, 191, 255, 247, 128 },
-        {   1,  14,  19,  94,  74, 124, 209, 109, 255, 245, 128 }
+        { 108, 204, 248 },
+        { 107, 156, 235 },
+        { 103,  27, 186 },
+        {  71,   4, 124 },
+        {  39,   1,  66 },
+        {  14,   1,  19 }
       }, { /* Coeff Band 4 */
-        { 112, 213, 248, 255, 231, 218, 255, 234, 255, 255, 128 },
-        {  80, 172, 234, 254, 220, 216, 255, 233, 255, 255, 128 },
-        {  11, 112, 182, 251, 195, 204, 255, 231, 255, 224, 128 },
-        {   2,  73, 126, 241, 159, 186, 254, 219, 255, 255, 128 },
-        {   1,  40,  69, 207, 111, 159, 249, 191, 255, 255, 128 },
-        {   1,  16,  24,  83,  78, 138, 230, 134, 255, 239, 128 }
+        { 120, 211, 248 },
+        { 118, 149, 234 },
+        { 107,  19, 182 },
+        {  72,   3, 126 },
+        {  40,   1,  69 },
+        {  16,   1,  24 }
       }, { /* Coeff Band 5 */
-        { 100, 209, 245, 255, 236, 225, 248, 231, 255, 192, 128 },
-        {  65, 164, 232, 255, 226, 221, 255, 240, 255, 255, 128 },
-        {  11, 117, 186, 253, 203, 209, 255, 240, 255, 255, 128 },
-        {   2,  83, 136, 245, 167, 191, 253, 222, 255, 255, 128 },
-        {   1,  55,  88, 213, 122, 157, 248, 182, 255, 255, 128 },
-        {   1,  10,  38,  58,  85,  43, 198, 107, 255, 255, 128 }
+        { 127, 199, 245 },
+        { 122, 125, 232 },
+        { 112,  20, 186 },
+        {  82,   3, 136 },
+        {  55,   1,  88 },
+        {  10,   1,  38 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        {   8,  26, 101, 170, 141, 159, 166, 138, 205, 164, 158 },
-        {   2,  25,  67, 119, 124, 152, 121, 123, 189, 145, 175 },
-        {   1,  15,  28,  67, 102, 139,  95, 107, 191, 136, 187 }
+        {  25,   9, 101 },
+        {  25,   2,  67 },
+        {  15,   1,  28 }
       }, { /* Coeff Band 1 */
-        {  22,  73, 118, 160, 137, 157, 175, 132, 242, 184, 229 },
-        {  43,  73, 116, 160, 137, 157, 177, 132, 242, 185, 231 },
-        {  24,  66, 105, 158, 134, 156, 175, 133, 242, 185, 232 },
-        {   9,  54,  85, 150, 126, 153, 175, 132, 242, 185, 231 },
-        {   2,  34,  54, 123, 109, 145, 168, 124, 242, 183, 231 },
-        {   1,  14,  22,  63,  93, 134, 108, 103, 214, 149, 206 }
+        {  67,  30, 118 },
+        {  61,  56, 116 },
+        {  60,  31, 105 },
+        {  52,  11,  85 },
+        {  34,   2,  54 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 2 */
-        {  34, 123, 149, 186, 148, 163, 195, 143, 245, 195, 233 },
-        {  34, 106, 147, 189, 149, 164, 198, 146, 246, 197, 234 },
-        {  10,  81, 123, 186, 143, 162, 200, 147, 246, 198, 235 },
-        {   2,  56,  87, 170, 127, 156, 201, 143, 248, 202, 234 },
-        {   1,  35,  56, 138, 109, 146, 187, 133, 246, 196, 233 },
-        {   1,  17,  27,  80,  93, 135, 136, 109, 229, 168, 215 }
+        { 107,  58, 149 },
+        {  92,  53, 147 },
+        {  78,  14, 123 },
+        {  56,   3,  87 },
+        {  35,   1,  56 },
+        {  17,   1,  27 }
       }, { /* Coeff Band 3 */
-        {  27, 159, 171, 208, 161, 171, 211, 155, 249, 205, 239 },
-        {  17, 119, 162, 213, 160, 172, 218, 160, 250, 210, 238 },
-        {   3,  81, 128, 207, 149, 168, 220, 161, 250, 213, 238 },
-        {   1,  53,  87, 183, 128, 158, 217, 153, 251, 214, 239 },
-        {   1,  31,  52, 143, 106, 145, 199, 137, 249, 205, 235 },
-        {   1,  14,  24,  77,  89, 133, 142, 109, 234, 174, 215 }
+        { 142,  61, 171 },
+        { 111,  30, 162 },
+        {  80,   4, 128 },
+        {  53,   1,  87 },
+        {  31,   1,  52 },
+        {  14,   1,  24 }
       }, { /* Coeff Band 4 */
-        {  24, 189, 200, 224, 177, 178, 221, 164, 250, 212, 234 },
-        {  14, 136, 184, 230, 176, 181, 228, 172, 252, 215, 231 },
-        {   2,  87, 140, 222, 159, 176, 230, 172, 252, 218, 238 },
-        {   1,  54,  90, 193, 130, 161, 223, 160, 252, 217, 241 },
-        {   1,  28,  49, 142, 103, 144, 202, 139, 250, 208, 233 },
-        {   1,  12,  21,  73,  87, 132, 141, 106, 234, 176, 209 }
+        { 171,  73, 200 },
+        { 129,  28, 184 },
+        {  86,   3, 140 },
+        {  54,   1,  90 },
+        {  28,   1,  49 },
+        {  12,   1,  21 }
       }, { /* Coeff Band 5 */
-        {  32, 220, 227, 242, 199, 190, 234, 180, 251, 220, 232 },
-        {  12, 155, 200, 242, 190, 191, 240, 187, 252, 225, 230 },
-        {   1,  90, 144, 231, 164, 180, 240, 184, 253, 229, 239 },
-        {   1,  53,  90, 198, 130, 162, 230, 165, 253, 226, 238 },
-        {   1,  28,  50, 145, 103, 144, 207, 140, 251, 213, 236 },
-        {   1,  13,  22,  74,  88, 132, 142, 107, 233, 176, 216 }
+        { 193, 129, 227 },
+        { 148,  28, 200 },
+        {  90,   2, 144 },
+        {  53,   1,  90 },
+        {  28,   1,  50 },
+        {  13,   1,  22 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        {   5,  61, 234, 230, 183, 183, 212, 164, 241, 199, 205 },
-        {   3,  65, 184, 199, 164, 170, 182, 145, 232, 175, 223 },
-        {   1,  56, 104, 154, 137, 158, 156, 131, 221, 165, 210 }
+        {  60,   7, 234 },
+        {  64,   4, 184 },
+        {  56,   1, 104 }
       }, { /* Coeff Band 1 */
-        {  46, 183, 210, 229, 181, 182, 222, 165, 252, 214, 251 },
-        { 122, 166, 202, 228, 179, 181, 223, 164, 252, 217, 250 },
-        {  49, 125, 177, 225, 172, 179, 223, 163, 252, 215, 253 },
-        {  22,  99, 142, 216, 155, 173, 222, 164, 252, 215, 250 },
-        {   8,  69,  95, 180, 127, 156, 220, 153, 252, 214, 250 },
-        {   2,  38,  51, 112, 109, 144, 159, 118, 243, 184, 232 }
+        { 150, 111, 210 },
+        {  87, 185, 202 },
+        { 101,  81, 177 },
+        {  90,  34, 142 },
+        {  67,  11,  95 },
+        {  38,   2,  51 }
       }, { /* Coeff Band 2 */
-        {  56, 196, 218, 236, 187, 185, 231, 172, 254, 223, 239 },
-        {  38, 141, 195, 235, 182, 185, 233, 174, 254, 225, 232 },
-        {   7,  93, 147, 225, 164, 178, 233, 173, 255, 226, 248 },
-        {   2,  63, 101, 201, 137, 165, 227, 162, 254, 225, 248 },
-        {   1,  39,  61, 159, 110, 148, 213, 146, 254, 218, 247 },
-        {   1,  20,  33,  98,  95, 136, 166, 115, 247, 192, 231 }
+        { 153, 139, 218 },
+        { 120,  72, 195 },
+        {  90,  11, 147 },
+        {  63,   3, 101 },
+        {  39,   1,  61 },
+        {  20,   1,  33 }
       }, { /* Coeff Band 3 */
-        {  44, 206, 223, 240, 193, 189, 235, 177, 255, 231, 224 },
-        {  27, 147, 200, 240, 188, 189, 238, 181, 255, 229, 239 },
-        {   4,  93, 147, 230, 165, 180, 238, 180, 255, 231, 237 },
-        {   1,  58,  95, 201, 134, 164, 229, 164, 255, 228, 254 },
-        {   1,  32,  52, 152, 105, 146, 212, 142, 254, 221, 255 },
-        {   1,  14,  23,  81,  87, 133, 156, 109, 248, 191, 236 }
+        { 171, 132, 223 },
+        { 131,  56, 200 },
+        {  92,   6, 147 },
+        {  58,   1,  95 },
+        {  32,   1,  52 },
+        {  14,   1,  23 }
       }, { /* Coeff Band 4 */
-        {  39, 216, 227, 244, 200, 194, 237, 179, 255, 231, 255 },
-        {  22, 152, 204, 243, 192, 193, 240, 186, 255, 231, 240 },
-        {   2,  92, 148, 232, 167, 183, 239, 182, 255, 232, 255 },
-        {   1,  55,  91, 200, 132, 164, 229, 164, 255, 230, 255 },
-        {   1,  28,  47, 144,  99, 142, 211, 141, 255, 222, 251 },
-        {   1,  13,  21,  75,  86, 131, 152, 103, 249, 193, 242 }
+        { 183, 137, 227 },
+        { 139,  48, 204 },
+        {  91,   3, 148 },
+        {  55,   1,  91 },
+        {  28,   1,  47 },
+        {  13,   1,  21 }
       }, { /* Coeff Band 5 */
-        {  34, 228, 234, 249, 213, 201, 246, 194, 255, 239, 255 },
-        {  13, 161, 208, 247, 198, 197, 248, 197, 255, 243, 255 },
-        {   1,  95, 148, 234, 166, 183, 246, 190, 255, 243, 236 },
-        {   1,  55,  90, 199, 128, 161, 237, 168, 255, 239, 255 },
-        {   1,  30,  51, 147, 102, 144, 218, 142, 255, 232, 254 },
-        {   1,  16,  25,  86,  88, 131, 168, 109, 252, 207, 245 }
+        { 198, 149, 234 },
+        { 153,  32, 208 },
+        {  95,   2, 148 },
+        {  55,   1,  90 },
+        {  30,   1,  51 },
+        {  16,   1,  25 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 204,  33, 217, 233, 185, 184, 199, 165, 204, 163, 162 },
-        {  93,  48, 151, 209, 157, 171, 193, 161, 203, 167, 189 },
-        {  18,  43,  86, 173, 126, 156, 203, 149, 231, 193, 200 }
+        {   7, 209, 217 },
+        {  31, 106, 151 },
+        {  40,  21,  86 }
       }, { /* Coeff Band 1 */
-        {  43, 121, 184, 233, 173, 182, 235, 187, 248, 211, 237 },
-        {  93, 117, 177, 232, 170, 180, 235, 182, 246, 204, 224 },
-        {  33, 101, 158, 229, 165, 179, 235, 182, 245, 207, 236 },
-        {  11,  81, 129, 221, 153, 173, 233, 179, 246, 203, 229 },
-        {   2,  51,  82, 188, 124, 158, 224, 162, 248, 206, 228 },
-        {   1,  18,  29,  88,  93, 137, 141, 116, 222, 161, 217 }
+        { 101,  71, 184 },
+        {  74, 131, 177 },
+        {  88,  50, 158 },
+        {  78,  16, 129 },
+        {  51,   2,  82 },
+        {  18,   1,  29 }
       }, { /* Coeff Band 2 */
-        {  63, 154, 199, 239, 184, 187, 236, 187, 248, 209, 221 },
-        {  53, 128, 191, 239, 182, 188, 236, 188, 251, 209, 255 },
-        {  14,  99, 160, 235, 172, 184, 235, 187, 249, 207, 240 },
-        {   4,  75, 122, 219, 150, 173, 226, 177, 250, 204, 240 },
-        {   1,  47,  77, 176, 121, 154, 207, 153, 245, 197, 237 },
-        {   1,  18,  30,  84,  95, 136, 138, 112, 229, 167, 228 }
+        { 116, 115, 199 },
+        { 102,  88, 191 },
+        {  94,  22, 160 },
+        {  74,   6, 122 },
+        {  47,   1,  77 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 3 */
-        {  48, 193, 210, 245, 194, 194, 241, 196, 252, 213, 255 },
-        {  26, 145, 201, 245, 194, 196, 240, 195, 251, 215, 240 },
-        {   6, 104, 165, 241, 179, 190, 239, 191, 253, 222, 255 },
-        {   1,  73, 120, 218, 151, 174, 227, 172, 251, 219, 248 },
-        {   1,  42,  69, 167, 118, 153, 205, 146, 251, 206, 245 },
-        {   1,  16,  27,  84,  89, 133, 148, 112, 240, 179, 238 }
+        { 157, 124, 210 },
+        { 130,  53, 201 },
+        { 102,  10, 165 },
+        {  73,   1, 120 },
+        {  42,   1,  69 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 4 */
-        {  47, 213, 225, 248, 203, 199, 240, 194, 254, 211, 255 },
-        {  32, 153, 212, 248, 201, 199, 241, 196, 251, 226, 255 },
-        {   6, 102, 168, 240, 181, 190, 240, 187, 251, 225, 238 },
-        {   1,  66, 111, 211, 146, 169, 229, 167, 255, 224, 244 },
-        {   1,  36,  60, 157, 110, 148, 209, 143, 252, 215, 255 },
-        {   1,  16,  27,  83,  90, 133, 152, 111, 244, 184, 250 }
+        { 174, 147, 225 },
+        { 134,  67, 212 },
+        { 100,  10, 168 },
+        {  66,   1, 111 },
+        {  36,   1,  60 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 5 */
-        {  46, 225, 232, 252, 219, 208, 247, 204, 254, 233, 255 },
-        {  24, 162, 214, 250, 208, 204, 247, 201, 254, 236, 255 },
-        {   3, 106, 165, 242, 182, 191, 245, 196, 255, 231, 255 },
-        {   1,  66, 108, 213, 142, 169, 235, 175, 255, 226, 247 },
-        {   1,  35,  59, 158, 108, 147, 216, 146, 254, 220, 255 },
-        {   1,  16,  27,  85,  90, 131, 159, 110, 248, 191, 252 }
+        { 185, 165, 232 },
+        { 147,  56, 214 },
+        { 105,   5, 165 },
+        {  66,   1, 108 },
+        {  35,   1,  59 },
+        {  16,   1,  27 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 229,  28, 245, 227, 195, 182, 200, 145, 253, 186, 255 },
-        { 151,  44, 210, 214, 180, 175, 193, 146, 247, 185, 254 },
-        {  55,  48, 131, 183, 148, 163, 194, 138, 249, 201, 246 }
+        {   3, 232, 245 },
+        {  18, 162, 210 },
+        {  38,  64, 131 }
       }, { /* Coeff Band 1 */
-        { 126, 165, 239, 250, 206, 204, 248, 193, 255, 255, 128 },
-        { 199, 158, 231, 248, 206, 198, 247, 200, 243, 255, 255 },
-        { 102, 136, 209, 248, 203, 197, 247, 201, 255, 244, 128 },
-        {  64, 116, 181, 245, 185, 196, 248, 201, 255, 233, 128 },
-        {  44,  98, 151, 233, 162, 179, 248, 195, 255, 242, 128 },
-        {  44,  81, 119, 204, 140, 165, 222, 163, 252, 217, 255 }
+        {  84, 187, 239 },
+        {  35, 231, 231 },
+        {  82, 150, 209 },
+        {  87,  97, 181 },
+        {  81,  64, 151 },
+        {  67,  60, 119 }
       }, { /* Coeff Band 2 */
-        { 108, 185, 239, 252, 216, 209, 248, 205, 255, 230, 128 },
-        {  91, 155, 224, 252, 211, 205, 251, 211, 255, 230, 128 },
-        {  20, 116, 185, 248, 194, 196, 252, 206, 255, 255, 128 },
-        {   8,  86, 141, 239, 168, 185, 248, 196, 255, 247, 128 },
-        {   3,  50,  92, 206, 125, 164, 242, 176, 255, 246, 128 },
-        {   1,  21,  40, 131,  85, 141, 200, 131, 247, 236, 255 }
+        { 107, 185, 239 },
+        { 100, 149, 224 },
+        { 107,  34, 185 },
+        {  83,  12, 141 },
+        {  49,   4,  92 },
+        {  21,   1,  40 }
       }, { /* Coeff Band 3 */
-        {  94, 198, 243, 254, 226, 215, 254, 220, 255, 255, 128 },
-        {  67, 164, 228, 253, 217, 208, 250, 216, 255, 213, 128 },
-        {  14, 120, 185, 250, 196, 205, 248, 205, 255, 255, 128 },
-        {   4,  83, 134, 238, 161, 181, 250, 202, 255, 233, 128 },
-        {   1,  48,  82, 196, 119, 157, 248, 178, 255, 255, 128 },
-        {   1,  26,  38,  96,  84, 132, 221, 110, 255, 209, 128 }
+        { 125, 184, 243 },
+        { 121, 127, 228 },
+        { 113,  25, 185 },
+        {  82,   6, 134 },
+        {  48,   1,  82 },
+        {  26,   1,  38 }
       }, { /* Coeff Band 4 */
-        {  82, 210, 245, 255, 230, 215, 246, 221, 255, 255, 128 },
-        {  55, 170, 231, 254, 222, 213, 255, 220, 255, 255, 128 },
-        {   8, 118, 184, 251, 200, 207, 255, 219, 255, 255, 128 },
-        {   2,  78, 126, 239, 156, 185, 251, 216, 255, 255, 128 },
-        {   1,  43,  68, 189, 108, 151, 247, 187, 255, 228, 128 },
-        {   1,  34,  40, 121, 114, 102, 205,  96, 255, 255, 128 }
+        { 143, 185, 245 },
+        { 133, 115, 231 },
+        { 114,  14, 184 },
+        {  77,   3, 126 },
+        {  43,   1,  68 },
+        {  34,   1,  40 }
       }, { /* Coeff Band 5 */
-        {  65, 228, 241, 255, 231, 214, 253, 222, 255, 255, 128 },
-        {  33, 173, 226, 254, 222, 216, 255, 231, 255, 255, 128 },
-        {   5, 120, 180, 251, 197, 205, 251, 226, 255, 233, 128 },
-        {   1,  81, 130, 240, 159, 187, 251, 206, 255, 205, 128 },
-        {   1,  51,  78, 198, 119, 168, 238, 181, 255, 171, 128 },
-        {   1,  18,  49, 183, 119, 160, 255, 171, 128, 128, 128 }
+        { 170, 194, 241 },
+        { 151,  80, 226 },
+        { 118,   9, 180 },
+        {  81,   1, 130 },
+        {  51,   1,  78 },
+        {  18,   1,  49 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        {  37,  34, 137, 205, 154, 170, 151, 159, 109, 172,  44 },
-        {   3,  26,  60, 113, 123, 154, 100, 124, 152, 131, 144 },
-        {   1,  13,  23,  54, 102, 139,  71, 106, 146, 123, 148 }
+        {  29,  42, 137 },
+        {  26,   3,  60 },
+        {  13,   1,  23 }
       }, { /* Coeff Band 1 */
-        {  26,  77, 122, 152, 144, 160, 143, 129, 216, 158, 201 },
-        {  43,  76, 123, 152, 142, 159, 145, 129, 218, 160, 204 },
-        {  25,  67, 112, 150, 141, 159, 144, 128, 218, 159, 204 },
-        {   9,  54,  90, 143, 134, 156, 144, 127, 218, 159, 204 },
-        {   2,  32,  52, 116, 114, 148, 138, 123, 217, 158, 207 },
-        {   1,  10,  15,  44,  91, 133,  75,  99, 172, 128, 169 }
+        {  69,  36, 122 },
+        {  63,  57, 123 },
+        {  60,  33, 112 },
+        {  52,  11,  90 },
+        {  32,   2,  52 },
+        {  10,   1,  15 }
       }, { /* Coeff Band 2 */
-        {  32, 122, 143, 163, 145, 161, 162, 131, 226, 171, 206 },
-        {  46, 105, 143, 168, 148, 161, 165, 133, 228, 174, 204 },
-        {  17,  79, 116, 164, 142, 161, 166, 134, 229, 174, 206 },
-        {   4,  53,  78, 143, 125, 153, 163, 129, 232, 175, 213 },
-        {   1,  29,  44, 105, 105, 142, 147, 120, 228, 168, 211 },
-        {   1,  12,  18,  52,  91, 133,  92, 100, 193, 140, 183 }
+        { 107,  55, 143 },
+        {  86,  69, 143 },
+        {  74,  24, 116 },
+        {  52,   5,  78 },
+        {  29,   1,  44 },
+        {  12,   1,  18 }
       }, { /* Coeff Band 3 */
-        {  33, 157, 160, 182, 149, 163, 185, 141, 236, 185, 218 },
-        {  20, 116, 152, 188, 152, 165, 191, 144, 238, 188, 217 },
-        {   4,  74, 114, 180, 141, 162, 192, 143, 240, 191, 219 },
-        {   1,  44,  69, 148, 119, 151, 183, 134, 243, 192, 227 },
-        {   1,  25,  40, 110, 101, 141, 162, 121, 238, 181, 223 },
-        {   1,  12,  18,  56,  89, 132, 103, 101, 206, 148, 196 }
+        { 137,  71, 160 },
+        { 107,  34, 152 },
+        {  73,   6, 114 },
+        {  44,   1,  69 },
+        {  25,   1,  40 },
+        {  12,   1,  18 }
       }, { /* Coeff Band 4 */
-        {  25, 183, 174, 207, 159, 171, 205, 156, 243, 194, 228 },
-        {  13, 124, 159, 209, 157, 171, 213, 160, 243, 200, 228 },
-        {   2,  75, 117, 199, 143, 166, 215, 158, 246, 205, 230 },
-        {   1,  45,  73, 165, 119, 153, 204, 144, 248, 205, 231 },
-        {   1,  26,  43, 120, 101, 141, 178, 127, 242, 192, 226 },
-        {   1,  12,  19,  59,  89, 132, 112, 102, 215, 154, 201 }
+        { 165,  70, 174 },
+        { 118,  24, 159 },
+        {  74,   3, 117 },
+        {  45,   1,  73 },
+        {  26,   1,  43 },
+        {  12,   1,  19 }
       }, { /* Coeff Band 5 */
-        {  13, 232, 223, 239, 196, 188, 225, 172, 248, 209, 226 },
-        {   4, 155, 187, 237, 184, 187, 233, 180, 250, 216, 232 },
-        {   1,  86, 131, 222, 156, 175, 233, 176, 251, 218, 237 },
-        {   1,  49,  79, 181, 123, 157, 218, 155, 251, 214, 237 },
-        {   1,  26,  43, 125, 100, 141, 188, 130, 246, 199, 231 },
-        {   1,  12,  20,  62,  88, 131, 119, 102, 222, 161, 209 }
+        { 220,  93, 223 },
+        { 153,  10, 187 },
+        {  86,   2, 131 },
+        {  49,   1,  79 },
+        {  26,   1,  43 },
+        {  12,   1,  20 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        {  51,  37, 227, 237, 205, 184, 200, 162, 231, 187, 207 },
-        {   9,  36, 172, 204, 176, 173, 171, 145, 217, 167, 197 },
-        {  21,  26, 112, 162, 145, 162, 155, 133, 215, 165, 191 }
+        {  30,  58, 227 },
+        {  35,  10, 172 },
+        {  24,  23, 112 }
       }, { /* Coeff Band 1 */
-        {  79, 169, 219, 223, 176, 177, 222, 161, 248, 213, 244 },
-        { 177, 166, 216, 222, 175, 178, 222, 161, 246, 212, 226 },
-        { 119, 141, 196, 222, 174, 176, 220, 163, 250, 212, 236 },
-        {  63, 117, 165, 217, 163, 175, 218, 161, 248, 209, 231 },
-        {  30,  87, 117, 192, 138, 162, 216, 157, 247, 211, 224 },
-        {  14,  56,  60, 119, 111, 146, 156, 123, 227, 171, 220 }
+        { 117, 145, 219 },
+        {  51, 221, 216 },
+        {  75, 169, 196 },
+        {  88,  96, 165 },
+        {  77,  43, 117 },
+        {  53,  18,  60 }
       }, { /* Coeff Band 2 */
-        {  88, 195, 225, 229, 181, 181, 229, 171, 252, 212, 221 },
-        {  66, 145, 202, 229, 177, 180, 230, 172, 253, 220, 255 },
-        {  12,  97, 152, 221, 162, 174, 230, 169, 253, 218, 249 },
-        {   3,  66, 103, 198, 138, 165, 223, 159, 253, 219, 251 },
-        {   1,  38,  61, 158, 110, 148, 209, 146, 252, 212, 238 },
-        {   1,  19,  30,  94,  94, 136, 160, 114, 244, 185, 236 }
+        { 128, 176, 225 },
+        { 108, 114, 202 },
+        {  92,  19, 152 },
+        {  65,   4, 103 },
+        {  38,   1,  61 },
+        {  19,   1,  30 }
       }, { /* Coeff Band 3 */
-        {  79, 211, 228, 235, 186, 184, 233, 176, 255, 225, 255 },
-        {  50, 151, 205, 235, 182, 185, 237, 177, 254, 223, 255 },
-        {   7,  95, 149, 225, 162, 176, 236, 177, 254, 229, 219 },
-        {   1,  62,  98, 198, 134, 164, 228, 162, 254, 224, 238 },
-        {   1,  35,  57, 156, 108, 148, 211, 143, 253, 215, 238 },
-        {   1,  17,  26,  87,  89, 135, 161, 113, 246, 189, 237 }
+        { 146, 184, 228 },
+        { 122,  95, 205 },
+        {  92,  11, 149 },
+        {  62,   1,  98 },
+        {  35,   1,  57 },
+        {  17,   1,  26 }
       }, { /* Coeff Band 4 */
-        {  68, 225, 230, 239, 190, 187, 238, 180, 252, 234, 255 },
-        {  39, 156, 206, 239, 185, 187, 241, 187, 254, 231, 255 },
-        {   4,  94, 147, 229, 163, 178, 242, 183, 255, 236, 224 },
-        {   1,  58,  94, 200, 132, 163, 232, 166, 254, 230, 255 },
-        {   1,  32,  52, 153, 104, 146, 214, 144, 253, 222, 236 },
-        {   1,  15,  24,  84,  89, 131, 159, 109, 247, 192, 240 }
+        { 165, 192, 230 },
+        { 132,  81, 206 },
+        {  93,   6, 147 },
+        {  58,   1,  94 },
+        {  32,   1,  52 },
+        {  15,   1,  24 }
       }, { /* Coeff Band 5 */
-        {  45, 248, 234, 248, 208, 198, 244, 193, 255, 233, 255 },
-        {  19, 169, 204, 246, 195, 195, 246, 199, 255, 233, 255 },
-        {   2,  98, 145, 235, 166, 183, 245, 192, 255, 235, 255 },
-        {   1,  59,  92, 205, 131, 164, 236, 172, 254, 231, 250 },
-        {   1,  33,  52, 152, 103, 145, 216, 144, 253, 221, 240 },
-        {   1,  15,  24,  83,  87, 133, 156, 110, 246, 191, 242 }
+        { 204, 223, 234 },
+        { 156,  49, 204 },
+        {  97,   3, 145 },
+        {  59,   1,  92 },
+        {  33,   1,  52 },
+        {  15,   1,  24 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 179,  23, 200, 222, 180, 182, 150, 152, 148, 135, 125 },
-        {  60,  33, 113, 185, 143, 166, 168, 144, 189, 168, 152 },
-        {   8,  31,  59, 137, 114, 150, 163, 132, 206, 171, 169 }
+        {   7, 184, 200 },
+        {  25,  67, 113 },
+        {  30,   9,  59 }
       }, { /* Coeff Band 1 */
-        {  27, 103, 158, 215, 157, 174, 209, 165, 239, 191, 233 },
-        {  90, 101, 159, 213, 156, 173, 212, 164, 230, 185, 237 },
-        {  39,  91, 146, 212, 155, 169, 212, 165, 232, 186, 207 },
-        {  16,  75, 120, 203, 144, 169, 210, 161, 233, 189, 227 },
-        {   3,  48,  76, 167, 120, 154, 199, 146, 236, 190, 218 },
-        {   1,  18,  26,  72,  95, 137, 113, 109, 197, 146, 186 }
+        {  92,  42, 158 },
+        {  65, 121, 159 },
+        {  77,  56, 146 },
+        {  70,  22, 120 },
+        {  47,   4,  76 },
+        {  18,   1,  26 }
       }, { /* Coeff Band 2 */
-        {  45, 137, 177, 218, 166, 174, 206, 163, 234, 184, 214 },
-        {  47, 117, 167, 218, 166, 176, 206, 164, 234, 182, 229 },
-        {  16,  90, 136, 211, 153, 172, 205, 162, 236, 192, 231 },
-        {   6,  65, 100, 188, 136, 162, 193, 155, 237, 177, 228 },
-        {   1,  37,  58, 137, 113, 150, 166, 134, 229, 167, 234 },
-        {   1,  13,  19,  55,  90, 132,  93, 103, 196, 137, 202 }
+        { 113,  81, 177 },
+        {  96,  75, 167 },
+        {  84,  24, 136 },
+        {  63,   8, 100 },
+        {  37,   1,  58 },
+        {  13,   1,  19 }
       }, { /* Coeff Band 3 */
-        {  36, 171, 194, 227, 177, 179, 208, 165, 244, 196, 245 },
-        {  19, 129, 178, 227, 175, 184, 214, 165, 246, 188, 255 },
-        {   5,  90, 139, 217, 158, 174, 213, 166, 246, 198, 255 },
-        {   1,  59,  93, 182, 134, 162, 193, 150, 242, 188, 241 },
-        {   1,  31,  49, 122, 108, 145, 160, 127, 235, 172, 229 },
-        {   1,  10,  18,  54,  89, 132, 101,  99, 213, 144, 217 }
+        { 147,  85, 194 },
+        { 119,  36, 178 },
+        {  88,   8, 139 },
+        {  59,   1,  93 },
+        {  31,   1,  49 },
+        {  10,   1,  18 }
       }, { /* Coeff Band 4 */
-        {  37, 197, 210, 233, 187, 186, 216, 172, 250, 202, 255 },
-        {  20, 142, 191, 234, 183, 186, 219, 170, 249, 207, 246 },
-        {   3,  93, 144, 222, 163, 176, 219, 170, 249, 204, 224 },
-        {   1,  56,  88, 179, 130, 159, 199, 148, 246, 197, 243 },
-        {   1,  29,  47, 123, 104, 144, 172, 127, 244, 185, 234 },
-        {   1,  14,  22,  66,  91, 130, 120, 103, 225, 158, 221 }
+        { 169, 108, 210 },
+        { 131,  41, 191 },
+        {  92,   5, 144 },
+        {  56,   1,  88 },
+        {  29,   1,  47 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 5 */
-        {  19, 227, 223, 245, 203, 194, 238, 187, 251, 225, 217 },
-        {   6, 152, 192, 242, 189, 190, 241, 190, 253, 225, 255 },
-        {   1,  89, 138, 228, 161, 177, 239, 181, 254, 224, 248 },
-        {   1,  52,  84, 188, 127, 157, 224, 159, 253, 222, 247 },
-        {   1,  29,  47, 132, 102, 140, 196, 132, 251, 208, 244 },
-        {   1,  14,  23,  71,  90, 133, 134, 103, 239, 174, 233 }
+        { 210, 106, 223 },
+        { 148,  14, 192 },
+        {  89,   2, 138 },
+        {  52,   1,  84 },
+        {  29,   1,  47 },
+        {  14,   1,  23 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 205,  14, 245, 235, 216, 189, 190, 146, 249, 201, 255 },
-        {  97,  19, 213, 210, 194, 174, 176, 139, 241, 183, 250 },
-        {  31,  20, 144, 183, 160, 167, 171, 132, 240, 184, 253 }
+        {   3, 207, 245 },
+        {  12, 102, 213 },
+        {  18,  33, 144 }
       }, { /* Coeff Band 1 */
-        { 137, 182, 245, 254, 221, 216, 255, 160, 128, 128, 128 },
-        { 231, 185, 242, 251, 218, 205, 255, 233, 128, 128, 128 },
-        { 170, 175, 229, 252, 205, 209, 255, 211, 128, 128, 128 },
-        { 107, 157, 213, 250, 199, 205, 251, 207, 255, 255, 128 },
-        {  77, 126, 183, 243, 182, 183, 252, 206, 255, 255, 128 },
-        {  69,  96, 149, 229, 157, 170, 247, 169, 255, 255, 128 }
+        {  85, 205, 245 },
+        {  18, 249, 242 },
+        {  59, 221, 229 },
+        {  91, 166, 213 },
+        {  88, 117, 183 },
+        {  70,  95, 149 }
       }, { /* Coeff Band 2 */
-        { 107, 196, 241, 252, 211, 208, 255, 210, 128, 128, 128 },
-        {  92, 162, 221, 249, 203, 195, 255, 199, 128, 128, 128 },
-        {  20, 108, 181, 244, 190, 191, 250, 200, 255, 255, 128 },
-        {   7,  80, 132, 241, 172, 197, 253, 191, 255, 255, 128 },
-        {   2,  43,  75, 219, 122, 150, 255, 203, 128, 128, 128 },
-        {   1,  15,  48,  98,  51, 192, 255, 160, 128, 128, 128 }
+        { 114, 193, 241 },
+        { 104, 155, 221 },
+        { 100,  33, 181 },
+        {  78,  10, 132 },
+        {  43,   2,  75 },
+        {  15,   1,  48 }
       }, { /* Coeff Band 3 */
-        { 107, 202, 244, 254, 226, 215, 255, 192, 128, 128, 128 },
-        {  77, 167, 224, 252, 215, 212, 255, 235, 128, 128, 128 },
-        {  14, 117, 179, 249, 191, 196, 255, 212, 128, 128, 128 },
-        {   3,  84, 134, 237, 160, 194, 248, 216, 255, 255, 128 },
-        {   1,  57,  84, 216, 145, 136, 255, 161, 128, 128, 128 },
-        {   1,   1,   1, 255, 128, 255, 128, 128, 128, 128, 128 }
+        { 118, 198, 244 },
+        { 117, 142, 224 },
+        { 111,  25, 179 },
+        {  83,   4, 134 },
+        {  57,   1,  84 },
+        {   1,   1,   1 }
       }, { /* Coeff Band 4 */
-        {  88, 219, 248, 255, 239, 225, 255, 255, 128, 128, 128 },
-        {  61, 178, 234, 255, 227, 227, 255, 217, 128, 128, 128 },
-        {   6, 127, 188, 252, 201, 211, 255, 244, 128, 128, 128 },
-        {   1,  83, 130, 248, 173, 197, 255, 175, 128, 128, 128 },
-        {   1,  61,  66, 211, 121, 188, 255, 213, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        { 144, 201, 248 },
+        { 136, 130, 234 },
+        { 124,  12, 188 },
+        {  83,   1, 130 },
+        {  61,   1,  66 },
+        {  64, 171, 128 }
       }, { /* Coeff Band 5 */
-        {  73, 243, 250, 255, 244, 220, 255, 205, 128, 128, 128 },
-        {  42, 197, 242, 255, 237, 227, 242, 166, 255, 255, 128 },
-        {  10, 137, 197, 252, 214, 199, 255, 238, 128, 128, 128 },
-        {   2,  85, 134, 242, 163, 185, 224, 238, 255, 255, 128 },
-        {   1,  70,  69, 199, 110,  64, 255, 213, 128, 128, 128 },
-        {   1,   1,   1,   1, 128, 128, 255,   1, 128, 128, 128 }
+        { 174, 227, 250 },
+        { 165, 118, 242 },
+        { 132,  21, 197 },
+        {  84,   3, 134 },
+        {  70,   1,  69 },
+        {   1,   1,   1 }
       }
     }
   }
 };
-
-#if CONFIG_CODE_NONZEROCOUNT
-
-// TODO(debargha): Remove the macro and count tables after experimentation
-#define NZC_DEFAULT_COUNTS  /* Uncomment to use counts as defaults */
-
-#ifdef NZC_DEFAULT_COUNTS
-static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC4X4_TOKENS] = {
-  {
-    {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 289116, 22938, 4522, 1935, 520, 47 }
-    }, {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 689116, 22938, 4522, 1935, 520, 47 }
-    },
-  }, {
-    {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 10405, 12395, 3401, 3574, 2461, 771 }
-    }, {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 20405, 12395, 3401, 3574, 2461, 771 }
+#else
+static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 195,  29, 183 },
+        {  84,  49, 136 },
+        {   8,  42,  71 }
+      }, { /* Coeff Band 1 */
+        {  31, 107, 169 },
+        {  35,  99, 159 },
+        {  17,  82, 140 },
+        {   8,  66, 114 },
+        {   2,  44,  76 },
+        {   1,  19,  32 }
+      }, { /* Coeff Band 2 */
+        {  40, 132, 201 },
+        {  29, 114, 187 },
+        {  13,  91, 157 },
+        {   7,  75, 127 },
+        {   3,  58,  95 },
+        {   1,  28,  47 }
+      }, { /* Coeff Band 3 */
+        {  69, 142, 221 },
+        {  42, 122, 201 },
+        {  15,  91, 159 },
+        {   6,  67, 121 },
+        {   1,  42,  77 },
+        {   1,  17,  31 }
+      }, { /* Coeff Band 4 */
+        { 102, 148, 228 },
+        {  67, 117, 204 },
+        {  17,  82, 154 },
+        {   6,  59, 114 },
+        {   2,  39,  75 },
+        {   1,  15,  29 }
+      }, { /* Coeff Band 5 */
+        { 156,  57, 233 },
+        { 119,  57, 212 },
+        {  58,  48, 163 },
+        {  29,  40, 124 },
+        {  12,  30,  81 },
+        {   3,  12,  31 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 191, 107, 226 },
+        { 124, 117, 204 },
+        {  25,  99, 155 }
+      }, { /* Coeff Band 1 */
+        {  29, 148, 210 },
+        {  37, 126, 194 },
+        {   8,  93, 157 },
+        {   2,  68, 118 },
+        {   1,  39,  69 },
+        {   1,  17,  33 }
+      }, { /* Coeff Band 2 */
+        {  41, 151, 213 },
+        {  27, 123, 193 },
+        {   3,  82, 144 },
+        {   1,  58, 105 },
+        {   1,  32,  60 },
+        {   1,  13,  26 }
+      }, { /* Coeff Band 3 */
+        {  59, 159, 220 },
+        {  23, 126, 198 },
+        {   4,  88, 151 },
+        {   1,  66, 114 },
+        {   1,  38,  71 },
+        {   1,  18,  34 }
+      }, { /* Coeff Band 4 */
+        { 114, 136, 232 },
+        {  51, 114, 207 },
+        {  11,  83, 155 },
+        {   3,  56, 105 },
+        {   1,  33,  65 },
+        {   1,  17,  34 }
+      }, { /* Coeff Band 5 */
+        { 149,  65, 234 },
+        { 121,  57, 215 },
+        {  61,  49, 166 },
+        {  28,  36, 114 },
+        {  12,  25,  76 },
+        {   3,  16,  42 }
+      }
     }
-  }, {
-    {
-      { 4100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 6439, 1653 }
-    }, {
-      { 21100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 2439, 1653 }
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 214,  49, 220 },
+        { 132,  63, 188 },
+        {  42,  65, 137 }
+      }, { /* Coeff Band 1 */
+        {  85, 137, 221 },
+        { 104, 131, 216 },
+        {  49, 111, 192 },
+        {  21,  87, 155 },
+        {   2,  49,  87 },
+        {   1,  16,  28 }
+      }, { /* Coeff Band 2 */
+        {  89, 163, 230 },
+        {  90, 137, 220 },
+        {  29, 100, 183 },
+        {  10,  70, 135 },
+        {   2,  42,  81 },
+        {   1,  17,  33 }
+      }, { /* Coeff Band 3 */
+        { 108, 167, 237 },
+        {  55, 133, 222 },
+        {  15,  97, 179 },
+        {   4,  72, 135 },
+        {   1,  45,  85 },
+        {   1,  19,  38 }
+      }, { /* Coeff Band 4 */
+        { 124, 146, 240 },
+        {  66, 124, 224 },
+        {  17,  88, 175 },
+        {   4,  58, 122 },
+        {   1,  36,  75 },
+        {   1,  18,  37 }
+      }, { /* Coeff Band 5 */
+        { 141,  79, 241 },
+        { 126,  70, 227 },
+        {  66,  58, 182 },
+        {  30,  44, 136 },
+        {  12,  34,  96 },
+        {   2,  20,  47 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 229,  99, 249 },
+        { 143, 111, 235 },
+        {  46, 109, 192 }
+      }, { /* Coeff Band 1 */
+        {  82, 158, 236 },
+        {  94, 146, 224 },
+        {  25, 117, 191 },
+        {   9,  87, 149 },
+        {   3,  56,  99 },
+        {   1,  33,  57 }
+      }, { /* Coeff Band 2 */
+        {  83, 167, 237 },
+        {  68, 145, 222 },
+        {  10, 103, 177 },
+        {   2,  72, 131 },
+        {   1,  41,  79 },
+        {   1,  20,  39 }
+      }, { /* Coeff Band 3 */
+        {  99, 167, 239 },
+        {  47, 141, 224 },
+        {  10, 104, 178 },
+        {   2,  73, 133 },
+        {   1,  44,  85 },
+        {   1,  22,  47 }
+      }, { /* Coeff Band 4 */
+        { 127, 145, 243 },
+        {  71, 129, 228 },
+        {  17,  93, 177 },
+        {   3,  61, 124 },
+        {   1,  41,  84 },
+        {   1,  21,  52 }
+      }, { /* Coeff Band 5 */
+        { 157,  78, 244 },
+        { 140,  72, 231 },
+        {  69,  58, 184 },
+        {  31,  44, 137 },
+        {  14,  38, 105 },
+        {   8,  23,  61 }
+      }
     }
   }
 };
-
-static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC8X8_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
-    }, {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 125,  34, 187 },
+        {  52,  41, 133 },
+        {   6,  31,  56 }
+      }, { /* Coeff Band 1 */
+        {  37, 109, 153 },
+        {  51, 102, 147 },
+        {  23,  87, 128 },
+        {   8,  67, 101 },
+        {   1,  41,  63 },
+        {   1,  19,  29 }
+      }, { /* Coeff Band 2 */
+        {  31, 154, 185 },
+        {  17, 127, 175 },
+        {   6,  96, 145 },
+        {   2,  73, 114 },
+        {   1,  51,  82 },
+        {   1,  28,  45 }
+      }, { /* Coeff Band 3 */
+        {  23, 163, 200 },
+        {  10, 131, 185 },
+        {   2,  93, 148 },
+        {   1,  67, 111 },
+        {   1,  41,  69 },
+        {   1,  14,  24 }
+      }, { /* Coeff Band 4 */
+        {  29, 176, 217 },
+        {  12, 145, 201 },
+        {   3, 101, 156 },
+        {   1,  69, 111 },
+        {   1,  39,  63 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 5 */
+        {  57, 192, 233 },
+        {  25, 154, 215 },
+        {   6, 109, 167 },
+        {   3,  78, 118 },
+        {   1,  48,  69 },
+        {   1,  21,  29 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 202, 105, 245 },
+        { 108, 106, 216 },
+        {  18,  90, 144 }
+      }, { /* Coeff Band 1 */
+        {  33, 172, 219 },
+        {  64, 149, 206 },
+        {  14, 117, 177 },
+        {   5,  90, 141 },
+        {   2,  61,  95 },
+        {   1,  37,  57 }
+      }, { /* Coeff Band 2 */
+        {  33, 179, 220 },
+        {  11, 140, 198 },
+        {   1,  89, 148 },
+        {   1,  60, 104 },
+        {   1,  33,  57 },
+        {   1,  12,  21 }
+      }, { /* Coeff Band 3 */
+        {  30, 181, 221 },
+        {   8, 141, 198 },
+        {   1,  87, 145 },
+        {   1,  58, 100 },
+        {   1,  31,  55 },
+        {   1,  12,  20 }
+      }, { /* Coeff Band 4 */
+        {  32, 186, 224 },
+        {   7, 142, 198 },
+        {   1,  86, 143 },
+        {   1,  58, 100 },
+        {   1,  31,  55 },
+        {   1,  12,  22 }
+      }, { /* Coeff Band 5 */
+        {  57, 192, 227 },
+        {  20, 143, 204 },
+        {   3,  96, 154 },
+        {   1,  68, 112 },
+        {   1,  42,  69 },
+        {   1,  19,  32 }
+      }
     }
-  }, {
-    {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
-    }, {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 212,  35, 215 },
+        { 113,  47, 169 },
+        {  29,  48, 105 }
+      }, { /* Coeff Band 1 */
+        {  74, 129, 203 },
+        { 106, 120, 203 },
+        {  49, 107, 178 },
+        {  19,  84, 144 },
+        {   4,  50,  84 },
+        {   1,  15,  25 }
+      }, { /* Coeff Band 2 */
+        {  71, 172, 217 },
+        {  44, 141, 209 },
+        {  15, 102, 173 },
+        {   6,  76, 133 },
+        {   2,  51,  89 },
+        {   1,  24,  42 }
+      }, { /* Coeff Band 3 */
+        {  64, 185, 231 },
+        {  31, 148, 216 },
+        {   8, 103, 175 },
+        {   3,  74, 131 },
+        {   1,  46,  81 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 4 */
+        {  65, 196, 235 },
+        {  25, 157, 221 },
+        {   5, 105, 174 },
+        {   1,  67, 120 },
+        {   1,  38,  69 },
+        {   1,  15,  30 }
+      }, { /* Coeff Band 5 */
+        {  65, 204, 238 },
+        {  30, 156, 224 },
+        {   7, 107, 177 },
+        {   2,  70, 124 },
+        {   1,  42,  73 },
+        {   1,  18,  34 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 225,  86, 251 },
+        { 144, 104, 235 },
+        {  42,  99, 181 }
+      }, { /* Coeff Band 1 */
+        {  85, 175, 239 },
+        { 112, 165, 229 },
+        {  29, 136, 200 },
+        {  12, 103, 162 },
+        {   6,  77, 123 },
+        {   2,  53,  84 }
+      }, { /* Coeff Band 2 */
+        {  75, 183, 239 },
+        {  30, 155, 221 },
+        {   3, 106, 171 },
+        {   1,  74, 128 },
+        {   1,  44,  76 },
+        {   1,  17,  28 }
+      }, { /* Coeff Band 3 */
+        {  73, 185, 240 },
+        {  27, 159, 222 },
+        {   2, 107, 172 },
+        {   1,  75, 127 },
+        {   1,  42,  73 },
+        {   1,  17,  29 }
+      }, { /* Coeff Band 4 */
+        {  62, 190, 238 },
+        {  21, 159, 222 },
+        {   2, 107, 172 },
+        {   1,  72, 122 },
+        {   1,  40,  71 },
+        {   1,  18,  32 }
+      }, { /* Coeff Band 5 */
+        {  61, 199, 240 },
+        {  27, 161, 226 },
+        {   4, 113, 180 },
+        {   1,  76, 129 },
+        {   1,  46,  80 },
+        {   1,  23,  41 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
-    }
   }
 };
-
-static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC16X16_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
-    }, {
-      { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {   7,  27, 153 },
+        {   5,  30,  95 },
+        {   1,  16,  30 }
+      }, { /* Coeff Band 1 */
+        {  50,  75, 127 },
+        {  57,  75, 124 },
+        {  27,  67, 108 },
+        {  10,  54,  86 },
+        {   1,  33,  52 },
+        {   1,  12,  18 }
+      }, { /* Coeff Band 2 */
+        {  43, 125, 151 },
+        {  26, 108, 148 },
+        {   7,  83, 122 },
+        {   2,  59,  89 },
+        {   1,  38,  60 },
+        {   1,  17,  27 }
+      }, { /* Coeff Band 3 */
+        {  23, 144, 163 },
+        {  13, 112, 154 },
+        {   2,  75, 117 },
+        {   1,  50,  81 },
+        {   1,  31,  51 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 4 */
+        {  18, 162, 185 },
+        {   6, 123, 171 },
+        {   1,  78, 125 },
+        {   1,  51,  86 },
+        {   1,  31,  54 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 5 */
+        {  15, 199, 227 },
+        {   3, 150, 204 },
+        {   1,  91, 146 },
+        {   1,  55,  95 },
+        {   1,  30,  53 },
+        {   1,  11,  20 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  19,  55, 240 },
+        {  19,  59, 196 },
+        {   3,  52, 105 }
+      }, { /* Coeff Band 1 */
+        {  41, 166, 207 },
+        { 104, 153, 199 },
+        {  31, 123, 181 },
+        {  14, 101, 152 },
+        {   5,  72, 106 },
+        {   1,  36,  52 }
+      }, { /* Coeff Band 2 */
+        {  35, 176, 211 },
+        {  12, 131, 190 },
+        {   2,  88, 144 },
+        {   1,  60, 101 },
+        {   1,  36,  60 },
+        {   1,  16,  28 }
+      }, { /* Coeff Band 3 */
+        {  28, 183, 213 },
+        {   8, 134, 191 },
+        {   1,  86, 142 },
+        {   1,  56,  96 },
+        {   1,  30,  53 },
+        {   1,  12,  20 }
+      }, { /* Coeff Band 4 */
+        {  20, 190, 215 },
+        {   4, 135, 192 },
+        {   1,  84, 139 },
+        {   1,  53,  91 },
+        {   1,  28,  49 },
+        {   1,  11,  20 }
+      }, { /* Coeff Band 5 */
+        {  13, 196, 216 },
+        {   2, 137, 192 },
+        {   1,  86, 143 },
+        {   1,  57,  99 },
+        {   1,  32,  56 },
+        {   1,  13,  24 }
+      }
     }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 211,  29, 217 },
+        {  96,  47, 156 },
+        {  22,  43,  87 }
+      }, { /* Coeff Band 1 */
+        {  78, 120, 193 },
+        { 111, 116, 186 },
+        {  46, 102, 164 },
+        {  15,  80, 128 },
+        {   2,  49,  76 },
+        {   1,  18,  28 }
+      }, { /* Coeff Band 2 */
+        {  71, 161, 203 },
+        {  42, 132, 192 },
+        {  10,  98, 150 },
+        {   3,  69, 109 },
+        {   1,  44,  70 },
+        {   1,  18,  29 }
+      }, { /* Coeff Band 3 */
+        {  57, 186, 211 },
+        {  30, 140, 196 },
+        {   4,  93, 146 },
+        {   1,  62, 102 },
+        {   1,  38,  65 },
+        {   1,  16,  27 }
+      }, { /* Coeff Band 4 */
+        {  47, 199, 217 },
+        {  14, 145, 196 },
+        {   1,  88, 142 },
+        {   1,  57,  98 },
+        {   1,  36,  62 },
+        {   1,  15,  26 }
+      }, { /* Coeff Band 5 */
+        {  26, 219, 229 },
+        {   5, 155, 207 },
+        {   1,  94, 151 },
+        {   1,  60, 104 },
+        {   1,  36,  62 },
+        {   1,  16,  28 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 233,  29, 248 },
+        { 146,  47, 220 },
+        {  43,  52, 140 }
+      }, { /* Coeff Band 1 */
+        { 100, 163, 232 },
+        { 179, 161, 222 },
+        {  63, 142, 204 },
+        {  37, 113, 174 },
+        {  26,  89, 137 },
+        {  18,  68,  97 }
+      }, { /* Coeff Band 2 */
+        {  85, 181, 230 },
+        {  32, 146, 209 },
+        {   7, 100, 164 },
+        {   3,  71, 121 },
+        {   1,  45,  77 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 3 */
+        {  65, 187, 230 },
+        {  20, 148, 207 },
+        {   2,  97, 159 },
+        {   1,  68, 116 },
+        {   1,  40,  70 },
+        {   1,  14,  29 }
+      }, { /* Coeff Band 4 */
+        {  40, 194, 227 },
+        {   8, 147, 204 },
+        {   1,  94, 155 },
+        {   1,  65, 112 },
+        {   1,  39,  66 },
+        {   1,  14,  26 }
+      }, { /* Coeff Band 5 */
+        {  16, 208, 228 },
+        {   3, 151, 207 },
+        {   1,  98, 160 },
+        {   1,  67, 117 },
+        {   1,  41,  74 },
+        {   1,  17,  31 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }
   }
 };
-
-static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC32X32_TOKENS] = {
-  {
-    {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
-    }, {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
+static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {  17,  38, 140 },
+        {   7,  34,  80 },
+        {   1,  17,  29 }
+      }, { /* Coeff Band 1 */
+        {  37,  75, 128 },
+        {  41,  76, 128 },
+        {  26,  66, 116 },
+        {  12,  52,  94 },
+        {   2,  32,  55 },
+        {   1,  10,  16 }
+      }, { /* Coeff Band 2 */
+        {  50, 127, 154 },
+        {  37, 109, 152 },
+        {  16,  82, 121 },
+        {   5,  59,  85 },
+        {   1,  35,  54 },
+        {   1,  13,  20 }
+      }, { /* Coeff Band 3 */
+        {  40, 142, 167 },
+        {  17, 110, 157 },
+        {   2,  71, 112 },
+        {   1,  44,  72 },
+        {   1,  27,  45 },
+        {   1,  11,  17 }
+      }, { /* Coeff Band 4 */
+        {  30, 175, 188 },
+        {   9, 124, 169 },
+        {   1,  74, 116 },
+        {   1,  48,  78 },
+        {   1,  30,  49 },
+        {   1,  11,  18 }
+      }, { /* Coeff Band 5 */
+        {  10, 222, 223 },
+        {   2, 150, 194 },
+        {   1,  83, 128 },
+        {   1,  48,  79 },
+        {   1,  27,  45 },
+        {   1,  11,  17 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  36,  41, 235 },
+        {  29,  36, 193 },
+        {  10,  27, 111 }
+      }, { /* Coeff Band 1 */
+        {  85, 165, 222 },
+        { 177, 162, 215 },
+        { 110, 135, 195 },
+        {  57, 113, 168 },
+        {  23,  83, 120 },
+        {  10,  49,  61 }
+      }, { /* Coeff Band 2 */
+        {  85, 190, 223 },
+        {  36, 139, 200 },
+        {   5,  90, 146 },
+        {   1,  60, 103 },
+        {   1,  38,  65 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 3 */
+        {  72, 202, 223 },
+        {  23, 141, 199 },
+        {   2,  86, 140 },
+        {   1,  56,  97 },
+        {   1,  36,  61 },
+        {   1,  16,  27 }
+      }, { /* Coeff Band 4 */
+        {  55, 218, 225 },
+        {  13, 145, 200 },
+        {   1,  86, 141 },
+        {   1,  57,  99 },
+        {   1,  35,  61 },
+        {   1,  13,  22 }
+      }, { /* Coeff Band 5 */
+        {  15, 235, 212 },
+        {   1, 132, 184 },
+        {   1,  84, 139 },
+        {   1,  57,  97 },
+        {   1,  34,  56 },
+        {   1,  14,  23 }
+      }
     }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 181,  21, 201 },
+        {  61,  37, 123 },
+        {  10,  38,  71 }
+      }, { /* Coeff Band 1 */
+        {  47, 106, 172 },
+        {  95, 104, 173 },
+        {  42,  93, 159 },
+        {  18,  77, 131 },
+        {   4,  50,  81 },
+        {   1,  17,  23 }
+      }, { /* Coeff Band 2 */
+        {  62, 147, 199 },
+        {  44, 130, 189 },
+        {  28, 102, 154 },
+        {  18,  75, 115 },
+        {   2,  44,  65 },
+        {   1,  12,  19 }
+      }, { /* Coeff Band 3 */
+        {  55, 153, 210 },
+        {  24, 130, 194 },
+        {   3,  93, 146 },
+        {   1,  61,  97 },
+        {   1,  31,  50 },
+        {   1,  10,  16 }
+      }, { /* Coeff Band 4 */
+        {  49, 186, 223 },
+        {  17, 148, 204 },
+        {   1,  96, 142 },
+        {   1,  53,  83 },
+        {   1,  26,  44 },
+        {   1,  11,  17 }
+      }, { /* Coeff Band 5 */
+        {  13, 217, 212 },
+        {   2, 136, 180 },
+        {   1,  78, 124 },
+        {   1,  50,  83 },
+        {   1,  29,  49 },
+        {   1,  14,  23 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 197,  13, 247 },
+        {  82,  17, 222 },
+        {  25,  17, 162 }
+      }, { /* Coeff Band 1 */
+        { 126, 186, 247 },
+        { 234, 191, 243 },
+        { 176, 177, 234 },
+        { 104, 158, 220 },
+        {  66, 128, 186 },
+        {  55,  90, 137 }
+      }, { /* Coeff Band 2 */
+        { 111, 197, 242 },
+        {  46, 158, 219 },
+        {   9, 104, 171 },
+        {   2,  65, 125 },
+        {   1,  44,  80 },
+        {   1,  17,  91 }
+      }, { /* Coeff Band 3 */
+        { 104, 208, 245 },
+        {  39, 168, 224 },
+        {   3, 109, 162 },
+        {   1,  79, 124 },
+        {   1,  50, 102 },
+        {   1,  43, 102 }
+      }, { /* Coeff Band 4 */
+        {  84, 220, 246 },
+        {  31, 177, 231 },
+        {   2, 115, 180 },
+        {   1,  79, 134 },
+        {   1,  55,  77 },
+        {   1,  60,  79 }
+      }, { /* Coeff Band 5 */
+        {  43, 243, 240 },
+        {   8, 180, 217 },
+        {   1, 115, 166 },
+        {   1,  84, 121 },
+        {   1,  51,  67 },
+        {   1,  16,   6 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }
   }
 };
-
-#else
-
-static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC4X4_TOKENS] = {
-  {
-    {
-      { 219, 162, 179, 142, 242, },
-      { 214, 253, 228, 246, 255, },
-    }, {
-      { 225, 236, 190, 229, 253, },
-      { 251, 253, 240, 248, 255, },
-    },
-  }, {
-    {
-      { 106, 126, 158, 126, 244, },
-      { 118, 241, 201, 240, 255, },
-    }, {
-      { 165, 179, 143, 189, 242, },
-      { 173, 239, 192, 255, 128, },
-    },
-  }, {
-    {
-      { 42 , 78 , 153, 92 , 223, },
-      { 128, 128, 128, 128, 128, },
-    }, {
-      { 76 , 68 , 126, 110, 216, },
-      { 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC8X8_TOKENS] = {
-  {
-    {
-      { 134, 139, 170, 178, 142, 197, 255, },
-      { 167, 224, 199, 252, 205, 255, 128, },
-    }, {
-      { 181, 210, 180, 241, 190, 235, 255, },
-      { 234, 251, 235, 252, 219, 255, 128, },
-    },
-  }, {
-    {
-      { 33 , 64 , 155, 143, 86 , 216, 255, },
-      { 73 , 160, 167, 251, 153, 255, 128, },
-    }, {
-      { 79 , 104, 153, 195, 119, 246, 255, },
-      { 149, 183, 186, 249, 203, 255, 128, },
-    },
-  }, {
-    {
-      { 10 , 25 , 156, 61 , 69 , 156, 254, },
-      { 32 , 1  , 128, 146, 64 , 255, 128, },
-    }, {
-      { 37 , 48 , 143, 113, 81 , 202, 255, },
-      { 1  , 255, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC16X16_TOKENS] = {
-  {
-    {
-      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },
-      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },
-    }, {
-      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },
-      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 9  , 45 , 168, 85 , 66 , 221, 139, 246, 255, },
-      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 149, 175, 240, 149, 255, 205, 128, 128, },
-      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 12 , 173, 6  , 68 , 145, 41 , 204, 255, },
-      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },
-    }, {
-      { 1  , 121, 171, 149, 115, 242, 159, 255, 128, },
-      { 1  , 255, 255, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC32X32_TOKENS] = {
-  {
-    {
-      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },
-      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },
-    }, {
-      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },
-      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 6  , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },
-      { 17 , 53 , 43 , 189, 1  , 255, 171, 128, 128, 128, 128, },
-    }, {
-      { 5  , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },
-      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 7  , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },
-      { 1  , 1  , 128, 26 , 1  , 218, 78 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
 #endif
-
-static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                                            [NZC_TOKENS_EXTRA]
-                                            [NZC_BITS_EXTRA] = {
-  // Bit probabilities are in least to most significance order
-  {
-    {176, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {164, 192, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {154, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {168, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 184, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {152, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {160, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 176, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {150, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  },
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -8,11 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdio.h>
-
 #include "vp9/common/vp9_entropy.h"
-#include "string.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -20,8 +16,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_coefupdateprobs.h"
 
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
 DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -41,22 +35,16 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-// Unified coefficient band structure used by all block sizes
-DECLARE_ALIGNED(16, const int, vp9_coef_bands8x8[64]) = {
-  0, 1, 2, 3, 4, 4, 5, 5,
-  1, 2, 3, 4, 4, 5, 5, 5,
-  2, 3, 4, 4, 5, 5, 5, 5,
-  3, 4, 4, 5, 5, 5, 5, 5,
-  4, 4, 5, 5, 5, 5, 5, 5,
-  4, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
 };
-DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
-  0, 1, 2, 3,
-  1, 2, 3, 4,
-  2, 3, 4, 5,
-  3, 4, 5, 5
+
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+  5, 5, 5, 5, 5, 5
 };
 
 DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
@@ -63,8 +51,7 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-#if CONFIG_SCATTERSCAN
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = {
   0,  4,  1,  5,
   8,  2, 12,  9,
   3,  6, 13, 10,
@@ -85,7 +72,7 @@
   13, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
   0,  8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
   33, 19, 40, 12, 34, 27,  5, 41,
@@ -118,7 +105,7 @@
   60, 39, 61, 47, 54, 55, 62, 63,
 };
 
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
   0,  16,   1,  32,  17,   2,  48,  33,  18,   3,  64,  34,  49,  19,  65,  80,
   50,   4,  35,  66,  20,  81,  96,  51,   5,  36,  82,  97,  67, 112,  21,  52,
   98,  37,  83, 113,   6,  68, 128,  53,  22,  99, 114,  84,   7, 129,  38,  69,
@@ -175,218 +162,64 @@
   190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
 };
 
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
   0,   32,    1,   64,   33,    2,   96,   65,   34,  128,    3,   97,   66,  160,  129,   35,   98,    4,   67,  130,  161,  192,   36,   99,  224,    5,  162,  193,   68,  131,   37,  100,
   225,  194,  256,  163,   69,  132,    6,  226,  257,  288,  195,  101,  164,   38,  258,    7,  227,  289,  133,  320,   70,  196,  165,  290,  259,  228,   39,  321,  102,  352,    8,  197,
   71,  134,  322,  291,  260,  353,  384,  229,  166,  103,   40,  354,  323,  292,  135,  385,  198,  261,   72,    9,  416,  167,  386,  355,  230,  324,  104,  293,   41,  417,  199,  136,
   262,  387,  448,  325,  356,   10,   73,  418,  231,  168,  449,  294,  388,  105,  419,  263,   42,  200,  357,  450,  137,  480,   74,  326,  232,   11,  389,  169,  295,  420,  106,  451,
   481,  358,  264,  327,  201,   43,  138,  512,  482,  390,  296,  233,  170,  421,   75,  452,  359,   12,  513,  265,  483,  328,  107,  202,  514,  544,  422,  391,  453,  139,   44,  234,
-  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  392,  203,  108,  546,  485,  576,  298,  235,  140,  361,  516,  330,  172,  547,   45,  424,  455,  267,  393,  577,
-  486,   77,  204,  517,  362,  548,  608,   14,  456,  299,  578,  109,  236,  425,  394,  487,  609,  331,  141,  579,  518,   46,  268,   15,  173,  549,  610,  640,  363,   78,  519,  488,
-  300,  205,   16,  457,  580,  426,  550,  395,  110,  237,  611,  641,  332,  672,  142,  642,  269,  458,   47,  581,  427,  489,  174,  364,  520,  612,  551,  673,   79,  206,  301,  643,
-  704,   17,  111,  490,  674,  238,  582,   48,  521,  613,  333,  396,  459,  143,  270,  552,  644,  705,  736,  365,   80,  675,  583,  175,  428,  706,  112,  302,  207,  614,  553,   49,
-  645,  522,  737,  397,  768,  144,  334,   18,  676,  491,  239,  615,  707,  584,   81,  460,  176,  271,  738,  429,  113,  800,  366,  208,  523,  708,  646,  554,  677,  769,   19,  145,
-  585,  739,  240,  303,   50,  461,  616,  398,  647,  335,  492,  177,   82,  770,  832,  555,  272,  430,  678,  209,  709,  114,  740,  801,  617,   51,  304,  679,  524,  367,  586,  241,
-  20,  146,  771,  864,   83,  802,  648,  493,  399,  273,  336,  710,  178,  462,  833,  587,  741,  115,  305,  711,  368,  525,  618,  803,  210,  896,  680,  834,  772,   52,  649,  147,
-  431,  494,  556,  242,  400,  865,  337,   21,  928,  179,  742,   84,  463,  274,  369,  804,  650,  557,  743,  960,  835,  619,  773,  306,  211,  526,  432,  992,  588,  712,  116,  243,
-  866,  495,  681,  558,  805,  589,  401,  897,   53,  338,  148,  682,  867,  464,  275,   22,  370,  433,  307,  620,  527,  836,  774,  651,  713,  744,   85,  180,  621,  465,  929,  775,
-  496,  898,  212,  339,  244,  402,  590,  117,  559,  714,  434,   23,  868,  930,  806,  683,  528,  652,  371,  961,  149,  837,   54,  899,  745,  276,  993,  497,  403,  622,  181,  776,
-  746,  529,  560,  435,   86,  684,  466,  308,  591,  653,  715,  807,  340,  869,  213,  962,  245,  838,  561,  931,  808,  592,  118,  498,  372,  623,  685,  994,  467,  654,  747,  900,
-  716,  277,  150,   55,   24,  404,  530,  839,  777,  655,  182,  963,  840,  686,  778,  309,  870,  341,   87,  499,  809,  624,  593,  436,  717,  932,  214,  246,  995,  718,  625,  373,
-  562,   25,  119,  901,  531,  468,  964,  748,  810,  278,  779,  500,  563,  656,  405,  687,  871,  872,  594,  151,  933,  749,  841,  310,  657,  626,  595,  437,  688,  183,  996,  965,
-  902,  811,  342,  750,  689,  719,  532,   56,  215,  469,  934,  374,  247,  720,  780,  564,  781,  842,  406,   26,  751,  903,  873,   57,  279,  627,  501,  658,  843,  997,  812,  904,
-  88,  813,  438,  752,  935,  936,  311,  596,  533,  690,  343,  966,  874,   89,  120,  470,  721,  875,  659,  782,  565,  998,  375,  844,  845,   27,  628,  967,  121,  905,  968,  152,
-  937,  814,  753,  502,  691,  783,  184,  153,  722,  407,   58,  815,  999,  660,  597,  723,  534,  906,  216,  439,  907,  248,  185,  876,  846,  692,  784,  629,   90,  969,  280,  754,
-  938,  939,  217,  847,  566,  471,  785,  816,  877, 1000,  249,  878,  661,  503,  312,  970,  755,  122,  817,  281,  344,  786,  598,  724,   28,   59,   29,  154,  535,  630,  376, 1001,
-  313,  908,  186,   91,  848,  849,  345,  909,  940,  879,  408,  818,  693, 1002,  971,  941,  567,  377,  218,  756,  910,  787,  440,  123,  880,  725,  662,  250,  819, 1003,  282,  972,
-  850,  599,  472,  409,  155,  441,  942,  757,  788,  694,  911,  881,  314,  631,  973,  504,  187, 1004,  346,  473,  851,  943,  820,  726,   60,  505,  219,  378,  912,  974,   30,   31,
-  536,  882, 1005,   92,  251,  663,  944,  913,  283,  695,  883,  568, 1006,  975,  410,  442,  945,  789,  852,  537, 1007,  124,  315,   61,  758,  821,  600,  914,  976,  569,  474,  347,
-  156, 1008,  915,   93,  977,  506,  946,  727,  379,  884,  188,  632,  601, 1009,  790,  853,  978,  947,  220,  411,  125,  633,  664,  759,  252,  443,  916,  538,  157,  822,   62,  570,
-  979,  284, 1010,  885,  948,  189,  475,   94,  316,  665,  696, 1011,  854,  791,  980,  221,  348,   63,  917,  602,  380,  507,  253,  126,  697,  823,  634,  285,  728,  949,  886,   95,
-  158,  539, 1012,  317,  412,  444,  760,  571,  190,  981,  729,  918,  127,  666,  349,  381,  476,  855,  761, 1013,  603,  222,  159,  698,  950,  508,  254,  792,  286,  635,  887,  793,
-  413,  191,  982,  445,  540,  318,  730,  667,  223,  824,  919, 1014,  350,  477,  572,  255,  825,  951,  762,  509,  604,  856,  382,  699,  287,  319,  636,  983,  794,  414,  541,  731,
-  857,  888,  351,  446,  573, 1015,  668,  889,  478,  826,  383,  763,  605,  920,  510,  637,  415,  700,  921,  858,  447,  952,  542,  795,  479,  953,  732,  890,  669,  574,  511,  984,
-  827,  985,  922, 1016,  764,  606,  543,  701,  859,  638, 1017,  575,  796,  954,  733,  891,  670,  607,  828,  986,  765,  923,  639, 1018,  702,  860,  955,  671,  892,  734,  797,  703,
-  987,  829, 1019,  766,  924,  735,  861,  956,  988,  893,  767,  798,  830, 1020,  925,  957,  799,  862,  831,  989,  894, 1021,  863,  926,  895,  958,  990, 1022,  927,  959,  991, 1023,
+  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  203,  108,  546,  485,  576,  298,  235,  140,  361,  330,  172,  547,   45,  455,  267,  577,  486,   77,  204,  362,
+  608,   14,  299,  578,  109,  236,  487,  609,  331,  141,  579,   46,   15,  173,  610,  363,   78,  205,   16,  110,  237,  611,  142,   47,  174,   79,  206,   17,  111,  238,   48,  143,
+  80,  175,  112,  207,   49,   18,  239,   81,  113,   19,   50,   82,  114,   51,   83,  115,  640,  516,  392,  268,  144,   20,  672,  641,  548,  517,  424,  393,  300,  269,  176,  145,
+  52,   21,  704,  673,  642,  580,  549,  518,  456,  425,  394,  332,  301,  270,  208,  177,  146,   84,   53,   22,  736,  705,  674,  643,  612,  581,  550,  519,  488,  457,  426,  395,
+  364,  333,  302,  271,  240,  209,  178,  147,  116,   85,   54,   23,  737,  706,  675,  613,  582,  551,  489,  458,  427,  365,  334,  303,  241,  210,  179,  117,   86,   55,  738,  707,
+  614,  583,  490,  459,  366,  335,  242,  211,  118,   87,  739,  615,  491,  367,  243,  119,  768,  644,  520,  396,  272,  148,   24,  800,  769,  676,  645,  552,  521,  428,  397,  304,
+  273,  180,  149,   56,   25,  832,  801,  770,  708,  677,  646,  584,  553,  522,  460,  429,  398,  336,  305,  274,  212,  181,  150,   88,   57,   26,  864,  833,  802,  771,  740,  709,
+  678,  647,  616,  585,  554,  523,  492,  461,  430,  399,  368,  337,  306,  275,  244,  213,  182,  151,  120,   89,   58,   27,  865,  834,  803,  741,  710,  679,  617,  586,  555,  493,
+  462,  431,  369,  338,  307,  245,  214,  183,  121,   90,   59,  866,  835,  742,  711,  618,  587,  494,  463,  370,  339,  246,  215,  122,   91,  867,  743,  619,  495,  371,  247,  123,
+  896,  772,  648,  524,  400,  276,  152,   28,  928,  897,  804,  773,  680,  649,  556,  525,  432,  401,  308,  277,  184,  153,   60,   29,  960,  929,  898,  836,  805,  774,  712,  681,
+  650,  588,  557,  526,  464,  433,  402,  340,  309,  278,  216,  185,  154,   92,   61,   30,  992,  961,  930,  899,  868,  837,  806,  775,  744,  713,  682,  651,  620,  589,  558,  527,
+  496,  465,  434,  403,  372,  341,  310,  279,  248,  217,  186,  155,  124,   93,   62,   31,  993,  962,  931,  869,  838,  807,  745,  714,  683,  621,  590,  559,  497,  466,  435,  373,
+  342,  311,  249,  218,  187,  125,   94,   63,  994,  963,  870,  839,  746,  715,  622,  591,  498,  467,  374,  343,  250,  219,  126,   95,  995,  871,  747,  623,  499,  375,  251,  127,
+  900,  776,  652,  528,  404,  280,  156,  932,  901,  808,  777,  684,  653,  560,  529,  436,  405,  312,  281,  188,  157,  964,  933,  902,  840,  809,  778,  716,  685,  654,  592,  561,
+  530,  468,  437,  406,  344,  313,  282,  220,  189,  158,  996,  965,  934,  903,  872,  841,  810,  779,  748,  717,  686,  655,  624,  593,  562,  531,  500,  469,  438,  407,  376,  345,
+  314,  283,  252,  221,  190,  159,  997,  966,  935,  873,  842,  811,  749,  718,  687,  625,  594,  563,  501,  470,  439,  377,  346,  315,  253,  222,  191,  998,  967,  874,  843,  750,
+  719,  626,  595,  502,  471,  378,  347,  254,  223,  999,  875,  751,  627,  503,  379,  255,  904,  780,  656,  532,  408,  284,  936,  905,  812,  781,  688,  657,  564,  533,  440,  409,
+  316,  285,  968,  937,  906,  844,  813,  782,  720,  689,  658,  596,  565,  534,  472,  441,  410,  348,  317,  286, 1000,  969,  938,  907,  876,  845,  814,  783,  752,  721,  690,  659,
+  628,  597,  566,  535,  504,  473,  442,  411,  380,  349,  318,  287, 1001,  970,  939,  877,  846,  815,  753,  722,  691,  629,  598,  567,  505,  474,  443,  381,  350,  319, 1002,  971,
+  878,  847,  754,  723,  630,  599,  506,  475,  382,  351, 1003,  879,  755,  631,  507,  383,  908,  784,  660,  536,  412,  940,  909,  816,  785,  692,  661,  568,  537,  444,  413,  972,
+  941,  910,  848,  817,  786,  724,  693,  662,  600,  569,  538,  476,  445,  414, 1004,  973,  942,  911,  880,  849,  818,  787,  756,  725,  694,  663,  632,  601,  570,  539,  508,  477,
+  446,  415, 1005,  974,  943,  881,  850,  819,  757,  726,  695,  633,  602,  571,  509,  478,  447, 1006,  975,  882,  851,  758,  727,  634,  603,  510,  479, 1007,  883,  759,  635,  511,
+  912,  788,  664,  540,  944,  913,  820,  789,  696,  665,  572,  541,  976,  945,  914,  852,  821,  790,  728,  697,  666,  604,  573,  542, 1008,  977,  946,  915,  884,  853,  822,  791,
+  760,  729,  698,  667,  636,  605,  574,  543, 1009,  978,  947,  885,  854,  823,  761,  730,  699,  637,  606,  575, 1010,  979,  886,  855,  762,  731,  638,  607, 1011,  887,  763,  639,
+  916,  792,  668,  948,  917,  824,  793,  700,  669,  980,  949,  918,  856,  825,  794,  732,  701,  670, 1012,  981,  950,  919,  888,  857,  826,  795,  764,  733,  702,  671, 1013,  982,
+  951,  889,  858,  827,  765,  734,  703, 1014,  983,  890,  859,  766,  735, 1015,  891,  767,  920,  796,  952,  921,  828,  797,  984,  953,  922,  860,  829,  798, 1016,  985,  954,  923,
+  892,  861,  830,  799, 1017,  986,  955,  893,  862,  831, 1018,  987,  894,  863, 1019,  895,  924,  956,  925,  988,  957,  926, 1020,  989,  958,  927, 1021,  990,  959, 1022,  991, 1023,
 };
-#else  // CONFIG_SCATTERSCAN
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
-  0,  1,  4,  8,
-  5,  2,  3,  6,
-  9, 12, 13, 10,
-  7, 11, 14, 15,
-};
 
-DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
-  0,   1,  2,  3,
-  4,   5,  6,  7,
-  8,   9, 10, 11,
-  12, 13, 14, 15
-};
-
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
-  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
-   0,  8, 16, 24, 32, 40, 48, 56,
-   1,  9, 17, 25, 33, 41, 49, 57,
-   2, 10, 18, 26, 34, 42, 50, 58,
-   3, 11, 19, 27, 35, 43, 51, 59,
-   4, 12, 20, 28, 36, 44, 52, 60,
-   5, 13, 21, 29, 37, 45, 53, 61,
-   6, 14, 22, 30, 38, 46, 54, 62,
-   7, 15, 23, 31, 39, 47, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
-   0,  1,  2,  3,  4,  5,  6,  7,
-   8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23,
-  24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39,
-  40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55,
-  56, 57, 58, 59, 60, 61, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
-  0,   1,  16,  32,  17,   2,   3,  18,
-  33,  48,  64,  49,  34,  19,   4,   5,
-  20,  35,  50,  65,  80,  96,  81,  66,
-  51,  36,  21,   6,   7,  22,  37,  52,
-  67,  82,  97, 112, 128, 113,  98,  83,
-  68,  53,  38,  23,   8,   9,  24,  39,
-  54,  69,  84,  99, 114, 129, 144, 160,
-  145, 130, 115, 100,  85,  70,  55,  40,
-  25,  10,  11,  26,  41,  56,  71,  86,
-  101, 116, 131, 146, 161, 176, 192, 177,
-  162, 147, 132, 117, 102,  87,  72,  57,
-  42,  27,  12,  13,  28,  43,  58, 73,
-  88, 103, 118, 133, 148, 163, 178, 193,
-  208, 224, 209, 194, 179, 164, 149, 134,
-  119, 104,  89,  74,  59,  44,  29,  14,
-  15,  30, 45,  60,  75,  90, 105, 120,
-  135, 150, 165, 180, 195, 210, 225, 240,
-  241, 226, 211, 196, 181, 166, 151, 136,
-  121, 106,  91,  76,  61,  46,  31,  47,
-  62,  77, 92, 107, 122, 137, 152, 167,
-  182, 197, 212, 227, 242, 243, 228, 213,
-  198, 183, 168, 153, 138, 123, 108, 93,
-  78,  63,  79,  94, 109, 124, 139, 154,
-  169, 184, 199, 214, 229, 244, 245, 230,
-  215, 200, 185, 170, 155, 140, 125, 110,
-  95, 111, 126, 141, 156, 171, 186, 201,
-  216, 231, 246, 247, 232, 217, 202, 187,
-  172, 157, 142, 127, 143, 158, 173, 188,
-  203, 218, 233, 248, 249, 234, 219, 204,
-  189, 174, 159, 175, 190, 205, 220, 235,
-  250, 251, 236, 221, 206, 191, 207, 222,
-  237, 252, 253, 238, 223, 239, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
-    0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
-    1,  17,  33,  49,  65,  81,  97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
-    2,  18,  34,  50,  66,  82,  98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
-    3,  19,  35,  51,  67,  83,  99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
-    4,  20,  36,  52,  68,  84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
-    5,  21,  37,  53,  69,  85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
-    6,  22,  38,  54,  70,  86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
-    7,  23,  39,  55,  71,  87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
-    8,  24,  40,  56,  72,  88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
-    9,  25,  41,  57,  73,  89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
-   10,  26,  42,  58,  74,  90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
-   11,  27,  43,  59,  75,  91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
-   12,  28,  44,  60,  76,  92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
-   13,  29,  45,  61,  77,  93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
-   14,  30,  46,  62,  78,  94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
-   15,  31,  47,  63,  79,  95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
-    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
-   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
-   64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-   80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
-   96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
-  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
-  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-    0,    1,   32,   64,   33,    2,    3,   34,   65,   96,  128,   97,   66,   35,    4,    5,   36,   67,   98,  129,  160,  192,  161,  130,   99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224,  256,  225,  194,  163,  132,  101,   70,   39,    8,    9,   40,   71,  102,  133,  164,  195,  226,  257,  288,  320,  289,  258,  227,  196,  165,  134,  103,   72,
-   41,   10,   11,   42,   73,  104,  135,  166,  197,  228,  259,  290,  321,  352,  384,  353,  322,  291,  260,  229,  198,  167,  136,  105,   74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,  416,  448,  417,  386,  355,  324,  293,  262,  231,  200,  169,  138,  107,   76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,  512,  481,  450,  419,  388,  357,  326,  295,  264,  233,  202,  171,  140,  109,   78,   47,   16,   17,   48,   79,  110,  141,  172,  203,
-  234,  265,  296,  327,  358,  389,  420,  451,  482,  513,  544,  576,  545,  514,  483,  452,  421,  390,  359,  328,  297,  266,  235,  204,  173,  142,  111,   80,   49,   18,   19,   50,
-   81,  112,  143,  174,  205,  236,  267,  298,  329,  360,  391,  422,  453,  484,  515,  546,  577,  608,  640,  609,  578,  547,  516,  485,  454,  423,  392,  361,  330,  299,  268,  237,
-  206,  175,  144,  113,   82,   51,   20,   21,   52,   83,  114,  145,  176,  207,  238,  269,  300,  331,  362,  393,  424,  455,  486,  517,  548,  579,  610,  641,  672,  704,  673,  642,
-  611,  580,  549,  518,  487,  456,  425,  394,  363,  332,  301,  270,  239,  208,  177,  146,  115,   84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
-  395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,  768,  737,  706,  675,  644,  613,  582,  551,  520,  489,  458,  427,  396,  365,  334,  303,  272,  241,  210,  179,
-  148,  117,   86,   55,   24,   25,   56,   87,  118,  149,  180,  211,  242,  273,  304,  335,  366,  397,  428,  459,  490,  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,  832,
-  801,  770,  739,  708,  677,  646,  615,  584,  553,  522,  491,  460,  429,  398,  367,  336,  305,  274,  243,  212,  181,  150,  119,   88,   57,   26,   27,   58,   89,  120,  151,  182,
-  213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,  896,  865,  834,  803,  772,  741,  710,  679,  648,  617,
-  586,  555,  524,  493,  462,  431,  400,  369,  338,  307,  276,  245,  214,  183,  152,  121,   90,   59,   28,   29,   60,   91,  122,  153,  184,  215,  246,  277,  308,  339,  370,  401,
-  432,  463,  494,  525,  556,  587,  618,  649,  680,  711,  742,  773,  804,  835,  866,  897,  928,  960,  929,  898,  867,  836,  805,  774,  743,  712,  681,  650,  619,  588,  557,  526,
-  495,  464,  433,  402,  371,  340,  309,  278,  247,  216,  185,  154,  123,   92,   61,   30,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
-  527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,  993,  962,  931,  900,  869,  838,  807,  776,  745,  714,  683,  652,  621,  590,  559,  528,
-  497,  466,  435,  404,  373,  342,  311,  280,  249,  218,  187,  156,  125,   94,   63,   95,  126,  157,  188,  219,  250,  281,  312,  343,  374,  405,  436,  467,  498,  529,  560,  591,
-  622,  653,  684,  715,  746,  777,  808,  839,  870,  901,  932,  963,  994,  995,  964,  933,  902,  871,  840,  809,  778,  747,  716,  685,  654,  623,  592,  561,  530,  499,  468,  437,
-  406,  375,  344,  313,  282,  251,  220,  189,  158,  127,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
-  841,  872,  903,  934,  965,  996,  997,  966,  935,  904,  873,  842,  811,  780,  749,  718,  687,  656,  625,  594,  563,  532,  501,  470,  439,  408,  377,  346,  315,  284,  253,  222,
-  191,  223,  254,  285,  316,  347,  378,  409,  440,  471,  502,  533,  564,  595,  626,  657,  688,  719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  999,  968,  937,  906,  875,
-  844,  813,  782,  751,  720,  689,  658,  627,  596,  565,  534,  503,  472,  441,  410,  379,  348,  317,  286,  255,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
-  659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000, 1001,  970,  939,  908,  877,  846,  815,  784,  753,  722,  691,  660,  629,  598,  567,  536,  505,  474,  443,  412,
-  381,  350,  319,  351,  382,  413,  444,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,  816,  847,  878,  909,  940,  971, 1002, 1003,  972,  941,  910,  879,  848,  817,
-  786,  755,  724,  693,  662,  631,  600,  569,  538,  507,  476,  445,  414,  383,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
-  973, 1004, 1005,  974,  943,  912,  881,  850,  819,  788,  757,  726,  695,  664,  633,  602,  571,  540,  509,  478,  447,  479,  510,  541,  572,  603,  634,  665,  696,  727,  758,  789,
-  820,  851,  882,  913,  944,  975, 1006, 1007,  976,  945,  914,  883,  852,  821,  790,  759,  728,  697,  666,  635,  604,  573,  542,  511,  543,  574,  605,  636,  667,  698,  729,  760,
-  791,  822,  853,  884,  915,  946,  977, 1008, 1009,  978,  947,  916,  885,  854,  823,  792,  761,  730,  699,  668,  637,  606,  575,  607,  638,  669,  700,  731,  762,  793,  824,  855,
-  886,  917,  948,  979, 1010, 1011,  980,  949,  918,  887,  856,  825,  794,  763,  732,  701,  670,  639,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012, 1013,  982,
-  951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
-  923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
-};
-#endif  // CONFIG_SCATTERSCAN
-
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
 const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
 {
-  -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
-  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
-  -ONE_TOKEN, 6,                               /* 2 = ONE */
+#if CONFIG_BALANCED_COEFTREE
+  -ZERO_TOKEN, 2,                             /* 0 = ZERO */
+  -DCT_EOB_TOKEN, 4,                          /* 1 = EOB  */
+#else
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+#endif
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
   8, 12,                                      /* 3 = LOW_VAL */
   -TWO_TOKEN, 10,                            /* 4 = TWO */
   -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                    /* 6 = HIGH_LOW */
+  14, 16,                                   /* 6 = HIGH_LOW */
   -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
   18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
 };
 
-struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
 
 /* Trees for extra bits.  Probabilities are constant and
    do not depend on previously encoded bits */
@@ -400,1660 +233,189 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 
-#if CONFIG_CODE_NONZEROCOUNT
-const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  -NZC_3TO4, 8,
-  -NZC_5TO8, -NZC_9TO16,
+const vp9_tree_index vp9_coefmodel_tree[6] = {
+#if CONFIG_BALANCED_COEFTREE
+  -ZERO_TOKEN, 2,
+  -DCT_EOB_MODEL_TOKEN, 4,
+#else
+  -DCT_EOB_MODEL_TOKEN, 2,                      /* 0 = EOB */
+  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
+#endif
+  -ONE_TOKEN, -TWO_TOKEN,
 };
-struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS];
 
-const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  -NZC_9TO16, 12,
-  -NZC_17TO32, -NZC_33TO64,
-};
-struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+// Model obtained from a 2-sided zero-centerd distribuition derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probablity of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
 
-const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  -NZC_33TO64, 16,
-  -NZC_65TO128, -NZC_129TO256,
-};
-struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-
-const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  16, 18,
-  -NZC_33TO64, -NZC_65TO128,
-  -NZC_129TO256, 20,
-  -NZC_257TO512, -NZC_513TO1024,
-};
-struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS];
-
-const int vp9_extranzcbits[NZC32X32_TOKENS] = {
-  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-};
-
-const int vp9_basenzcvalue[NZC32X32_TOKENS] = {
-  0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-#if CONFIG_MODELCOEFPROB
-
-const vp9_prob vp9_modelcoefprobs_gg875[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.875)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},
-  {2,   4,  12,  87, 129,  22,  89,  75,  97,  91,},
-  {3,   6,  17,  88, 130,  32,  90, 102, 102, 121,},
-  {4,   8,  22,  89, 131,  41,  91, 125, 107, 145,},
-  {5,  10,  28,  90, 131,  50,  93, 144, 112, 164,},
-  {6,  12,  33,  90, 132,  59,  94, 160, 117, 180,},
-  {7,  14,  38,  91, 132,  67,  95, 173, 122, 193,},
-  {8,  15,  42,  92, 133,  75,  97, 185, 126, 204,},
-  {9,  17,  47,  92, 133,  82,  98, 195, 131, 212,},
-  {10,  19,  52,  93, 134,  89,  99, 203, 135, 220,},
-  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},
-  {12,  23,  60,  95, 135, 102, 102, 217, 144, 231,},
-  {13,  25,  65,  95, 135, 109, 103, 222, 148, 235,},
-  {14,  26,  69,  96, 136, 115, 105, 227, 153, 238,},
-  {15,  28,  73,  97, 136, 120, 106, 231, 157, 241,},
-  {16,  30,  77,  97, 137, 126, 107, 234, 161, 244,},
-  {17,  32,  81,  98, 138, 131, 108, 237, 164, 246,},
-  {18,  34,  85,  99, 138, 136, 110, 240, 168, 247,},
-  {19,  35,  89, 100, 139, 141, 111, 242, 172, 249,},
-  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},
-  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},
-  {22,  41,  99, 102, 140, 154, 115, 247, 182, 252,},
-  {23,  42, 103, 102, 141, 158, 116, 248, 185, 252,},
-  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},
-  {25,  46, 110, 104, 142, 166, 118, 250, 191, 253,},
-  {26,  48, 113, 104, 142, 170, 120, 251, 194, 254,},
-  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},
-  {28,  51, 119, 106, 143, 176, 122, 252, 200, 254,},
-  {29,  53, 122, 107, 144, 180, 123, 253, 202, 255,},
-  {30,  54, 125, 107, 144, 183, 125, 253, 205, 255,},
-  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},
-  {32,  58, 131, 109, 145, 189, 127, 254, 209, 255,},
-  {33,  59, 134, 109, 146, 191, 128, 254, 212, 255,},
-  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},
-  {35,  62, 139, 111, 147, 196, 131, 255, 216, 255,},
-  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},
-  {37,  66, 145, 112, 148, 201, 134, 255, 220, 255,},
-  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},
-  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},
-  {40,  70, 152, 114, 149, 208, 137, 255, 225, 255,},
-  {41,  72, 155, 115, 150, 210, 138, 255, 226, 255,},
-  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},
-  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},
-  {44,  77, 161, 117, 151, 215, 142, 255, 230, 255,},
-  {45,  78, 164, 118, 152, 217, 143, 255, 232, 255,},
-  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},
-  {47,  81, 168, 120, 153, 220, 146, 255, 234, 255,},
-  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},
-  {49,  84, 172, 121, 154, 223, 148, 255, 236, 255,},
-  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},
-  {51,  87, 176, 123, 155, 226, 151, 255, 238, 255,},
-  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},
-  {53,  90, 180, 124, 156, 228, 153, 255, 240, 255,},
-  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},
-  {55,  93, 183, 126, 157, 231, 156, 255, 242, 255,},
-  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},
-  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},
-  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},
-  {59,  99, 190, 129, 159, 235, 160, 255, 244, 255,},
-  {60, 101, 192, 129, 159, 236, 162, 255, 245, 255,},
-  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},
-  {62, 104, 195, 131, 160, 238, 164, 255, 246, 255,},
-  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},
-  {64, 106, 198, 132, 162, 239, 166, 255, 247, 255,},
-  {65, 108, 199, 133, 162, 240, 167, 255, 248, 255,},
-  {66, 109, 201, 134, 163, 241, 169, 255, 248, 255,},
-  {67, 111, 202, 135, 163, 241, 170, 255, 249, 255,},
-  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},
-  {69, 113, 205, 136, 164, 243, 172, 255, 249, 255,},
-  {70, 115, 206, 137, 165, 243, 173, 255, 250, 255,},
-  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},
-  {72, 117, 209, 138, 166, 244, 175, 255, 250, 255,},
-  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},
-  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},
-  {75, 121, 212, 141, 167, 246, 179, 255, 251, 255,},
-  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},
-  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},
-  {78, 125, 216, 143, 169, 247, 182, 255, 252, 255,},
-  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},
-  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},
-  {81, 129, 219, 146, 171, 248, 185, 255, 253, 255,},
-  {82, 131, 220, 146, 171, 249, 186, 255, 253, 255,},
-  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},
-  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},
-  {85, 134, 223, 149, 173, 250, 189, 255, 253, 255,},
-  {86, 136, 224, 149, 173, 250, 190, 255, 254, 255,},
-  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},
-  {88, 138, 226, 151, 174, 251, 192, 255, 254, 255,},
-  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},
-  {90, 141, 227, 153, 175, 251, 194, 255, 254, 255,},
-  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},
-  {92, 143, 229, 154, 177, 252, 196, 255, 254, 255,},
-  {93, 144, 230, 155, 177, 252, 197, 255, 254, 255,},
-  {94, 146, 230, 156, 178, 252, 198, 255, 255, 255,},
-  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},
-  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},
-  {97, 149, 233, 158, 179, 253, 201, 255, 255, 255,},
-  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 152, 234, 160, 180, 253, 203, 255, 255, 255,},
-  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},
-  {101, 154, 235, 161, 182, 253, 205, 255, 255, 255,},
-  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},
-  {103, 156, 236, 163, 183, 254, 207, 255, 255, 255,},
-  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},
-  {105, 159, 238, 165, 184, 254, 208, 255, 255, 255,},
-  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},
-  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},
-  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},
-  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},
-  {110, 164, 240, 169, 187, 254, 212, 255, 255, 255,},
-  {111, 165, 241, 170, 187, 254, 213, 255, 255, 255,},
-  {112, 166, 241, 170, 188, 255, 214, 255, 255, 255,},
-  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},
-  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},
-  {115, 170, 243, 173, 189, 255, 216, 255, 255, 255,},
-  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},
-  {117, 172, 244, 174, 190, 255, 218, 255, 255, 255,},
-  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},
-  {119, 174, 244, 176, 192, 255, 219, 255, 255, 255,},
-  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},
-  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},
-  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},
-  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},
-  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},
-  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},
-  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},
-  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},
-  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},
-  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},
-  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},
-  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},
-  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},
-  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},
-  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},
-  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},
-  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},
-  {137, 192, 250, 190, 202, 255, 231, 255, 255, 255,},
-  {138, 193, 250, 191, 202, 255, 232, 255, 255, 255,},
-  {139, 194, 250, 192, 203, 255, 232, 255, 255, 255,},
-  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},
-  {141, 195, 251, 194, 204, 255, 234, 255, 255, 255,},
-  {142, 196, 251, 194, 205, 255, 234, 255, 255, 255,},
-  {143, 197, 251, 195, 205, 255, 235, 255, 255, 255,},
-  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},
-  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},
-  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},
-  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},
-  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},
-  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},
-  {150, 203, 252, 201, 209, 255, 238, 255, 255, 255,},
-  {151, 204, 253, 201, 210, 255, 239, 255, 255, 255,},
-  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},
-  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},
-  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {155, 208, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {156, 209, 253, 205, 213, 255, 241, 255, 255, 255,},
-  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},
-  {158, 210, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {160, 212, 254, 208, 215, 255, 242, 255, 255, 255,},
-  {161, 213, 254, 209, 215, 255, 243, 255, 255, 255,},
-  {162, 213, 254, 210, 216, 255, 243, 255, 255, 255,},
-  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},
-  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},
-  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},
-  {166, 216, 254, 212, 218, 255, 245, 255, 255, 255,},
-  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},
-  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},
-  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},
-  {170, 219, 255, 215, 221, 255, 246, 255, 255, 255,},
-  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},
-  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {173, 222, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},
-  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},
-  {176, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},
-  {179, 226, 255, 222, 225, 255, 249, 255, 255, 255,},
-  {180, 226, 255, 222, 226, 255, 249, 255, 255, 255,},
-  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},
-  {182, 228, 255, 224, 227, 255, 249, 255, 255, 255,},
-  {183, 228, 255, 224, 228, 255, 250, 255, 255, 255,},
-  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},
-  {185, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},
-  {188, 232, 255, 228, 230, 255, 251, 255, 255, 255,},
-  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},
-  {190, 233, 255, 229, 231, 255, 251, 255, 255, 255,},
-  {191, 233, 255, 229, 232, 255, 251, 255, 255, 255,},
-  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},
-  {193, 234, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {195, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {197, 237, 255, 233, 235, 255, 252, 255, 255, 255,},
-  {198, 237, 255, 234, 235, 255, 253, 255, 255, 255,},
-  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},
-  {200, 238, 255, 235, 236, 255, 253, 255, 255, 255,},
-  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},
-  {202, 239, 255, 236, 237, 255, 253, 255, 255, 255,},
-  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},
-  {204, 240, 255, 237, 238, 255, 254, 255, 255, 255,},
-  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {206, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {208, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {210, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {212, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {213, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {215, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {217, 246, 255, 244, 244, 255, 255, 255, 255, 255,},
-  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},
-  {219, 247, 255, 245, 245, 255, 255, 255, 255, 255,},
-  {220, 248, 255, 245, 246, 255, 255, 255, 255, 255,},
-  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},
-  {222, 248, 255, 246, 247, 255, 255, 255, 255, 255,},
-  {223, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {225, 250, 255, 247, 248, 255, 255, 255, 255, 255,},
-  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {227, 250, 255, 248, 249, 255, 255, 255, 255, 255,},
-  {228, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {230, 251, 255, 249, 250, 255, 255, 255, 255, 255,},
-  {231, 251, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {232, 252, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {234, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {235, 253, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {237, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {238, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {239, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {240, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {242, 254, 255, 253, 254, 255, 255, 255, 255, 255,},
-  {243, 254, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {244, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {245, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {247, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {248, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {249, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {250, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {252, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-};
-
-const vp9_prob vp9_modelcoefprobs_gg75[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.75)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},
-  {2,   4,  11,  88, 130,  21,  89,  68,  98,  79,},
-  {3,   6,  16,  89, 131,  30,  91,  92, 103, 105,},
-  {4,   8,  21,  90, 131,  38,  92, 112, 107, 126,},
-  {5,  10,  26,  90, 132,  46,  94, 129, 111, 143,},
-  {6,  11,  31,  91, 133,  54,  95, 143, 115, 157,},
-  {7,  13,  35,  92, 133,  61,  96, 156, 119, 170,},
-  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},
-  {9,  17,  44,  94, 134,  74,  98, 177, 126, 189,},
-  {10,  19,  48,  94, 135,  80, 100, 185, 130, 197,},
-  {11,  20,  52,  95, 135,  86, 101, 192, 133, 204,},
-  {12,  22,  56,  96, 136,  92, 102, 199, 137, 210,},
-  {13,  24,  60,  96, 136,  97, 103, 205, 140, 215,},
-  {14,  26,  64,  97, 137, 103, 104, 210, 143, 219,},
-  {15,  27,  68,  98, 137, 108, 105, 215, 146, 223,},
-  {16,  29,  71,  98, 138, 112, 106, 219, 149, 227,},
-  {17,  31,  75,  99, 138, 117, 107, 223, 152, 230,},
-  {18,  32,  78, 100, 139, 121, 108, 226, 155, 233,},
-  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},
-  {20,  36,  85, 101, 140, 130, 110, 231, 161, 238,},
-  {21,  37,  88, 102, 140, 134, 111, 234, 164, 239,},
-  {22,  39,  91, 102, 141, 138, 112, 236, 167, 241,},
-  {23,  40,  94, 103, 141, 141, 113, 238, 169, 243,},
-  {24,  42,  97, 104, 142, 145, 114, 240, 172, 244,},
-  {25,  44, 100, 104, 142, 149, 115, 241, 174, 245,},
-  {26,  45, 103, 105, 143, 152, 116, 243, 177, 246,},
-  {27,  47, 106, 105, 143, 155, 117, 244, 179, 247,},
-  {28,  48, 109, 106, 143, 158, 118, 245, 182, 248,},
-  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},
-  {30,  52, 115, 107, 144, 164, 120, 247, 186, 250,},
-  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},
-  {32,  55, 120, 109, 145, 170, 122, 249, 190, 251,},
-  {33,  56, 122, 109, 146, 173, 123, 249, 192, 252,},
-  {34,  58, 125, 110, 146, 175, 124, 250, 194, 252,},
-  {35,  59, 127, 110, 147, 178, 125, 251, 196, 252,},
-  {36,  61, 130, 111, 147, 180, 126, 251, 198, 253,},
-  {37,  62, 132, 112, 147, 183, 127, 251, 200, 253,},
-  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},
-  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},
-  {40,  67, 139, 114, 149, 189, 130, 253, 205, 254,},
-  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},
-  {42,  70, 144, 115, 150, 193, 132, 253, 209, 254,},
-  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},
-  {44,  72, 148, 116, 151, 197, 134, 254, 212, 255,},
-  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},
-  {46,  75, 152, 117, 151, 201, 136, 254, 215, 255,},
-  {47,  77, 154, 118, 152, 202, 137, 254, 216, 255,},
-  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},
-  {49,  80, 158, 119, 153, 206, 139, 255, 219, 255,},
-  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},
-  {51,  82, 162, 120, 154, 209, 141, 255, 221, 255,},
-  {52,  84, 164, 121, 154, 210, 142, 255, 222, 255,},
-  {53,  85, 165, 122, 155, 212, 143, 255, 224, 255,},
-  {54,  87, 167, 122, 155, 213, 144, 255, 225, 255,},
-  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},
-  {56,  89, 171, 124, 156, 216, 146, 255, 227, 255,},
-  {57,  91, 172, 124, 156, 217, 146, 255, 228, 255,},
-  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},
-  {59,  93, 176, 126, 157, 220, 148, 255, 230, 255,},
-  {60,  95, 177, 126, 158, 221, 149, 255, 231, 255,},
-  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},
-  {62,  97, 180, 127, 159, 223, 151, 255, 232, 255,},
-  {63,  99, 182, 128, 159, 224, 152, 255, 233, 255,},
-  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},
-  {65, 101, 185, 129, 160, 226, 154, 255, 235, 255,},
-  {66, 103, 186, 130, 160, 227, 155, 255, 236, 255,},
-  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},
-  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},
-  {69, 106, 190, 132, 162, 230, 158, 255, 238, 255,},
-  {70, 108, 192, 133, 162, 231, 159, 255, 238, 255,},
-  {71, 109, 193, 133, 162, 231, 159, 255, 239, 255,},
-  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},
-  {73, 111, 196, 134, 163, 233, 161, 255, 240, 255,},
-  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},
-  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},
-  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},
-  {77, 116, 200, 137, 165, 236, 165, 255, 243, 255,},
-  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},
-  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},
-  {80, 120, 204, 139, 167, 238, 168, 255, 244, 255,},
-  {81, 121, 205, 140, 167, 239, 168, 255, 244, 255,},
-  {82, 123, 206, 140, 167, 239, 169, 255, 245, 255,},
-  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},
-  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},
-  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},
-  {86, 127, 210, 143, 169, 241, 173, 255, 247, 255,},
-  {87, 129, 211, 144, 170, 242, 174, 255, 247, 255,},
-  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},
-  {89, 131, 213, 145, 171, 243, 175, 255, 248, 255,},
-  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},
-  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},
-  {92, 134, 216, 147, 172, 244, 178, 255, 249, 255,},
-  {93, 136, 217, 148, 172, 245, 179, 255, 249, 255,},
-  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},
-  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},
-  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},
-  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},
-  {98, 141, 221, 151, 175, 247, 183, 255, 250, 255,},
-  {99, 142, 222, 152, 175, 247, 184, 255, 250, 255,},
-  {100, 144, 223, 152, 176, 247, 185, 255, 251, 255,},
-  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},
-  {102, 146, 224, 154, 177, 248, 186, 255, 251, 255,},
-  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},
-  {104, 148, 226, 155, 177, 248, 188, 255, 252, 255,},
-  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},
-  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},
-  {107, 151, 228, 157, 179, 249, 191, 255, 252, 255,},
-  {108, 152, 229, 158, 179, 250, 191, 255, 252, 255,},
-  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},
-  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},
-  {111, 155, 231, 160, 181, 250, 194, 255, 253, 255,},
-  {112, 157, 231, 160, 181, 251, 195, 255, 253, 255,},
-  {113, 158, 232, 161, 182, 251, 195, 255, 253, 255,},
-  {114, 159, 232, 162, 182, 251, 196, 255, 253, 255,},
-  {115, 160, 233, 162, 183, 251, 197, 255, 253, 255,},
-  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},
-  {117, 162, 234, 164, 184, 252, 198, 255, 254, 255,},
-  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},
-  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},
-  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},
-  {121, 166, 236, 167, 186, 252, 201, 255, 254, 255,},
-  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},
-  {123, 168, 237, 168, 186, 253, 203, 255, 254, 255,},
-  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {125, 170, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},
-  {127, 172, 239, 171, 188, 253, 206, 255, 254, 255,},
-  {128, 173, 240, 171, 189, 253, 207, 255, 255, 255,},
-  {129, 174, 240, 172, 189, 253, 207, 255, 255, 255,},
-  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},
-  {131, 176, 241, 174, 190, 254, 209, 255, 255, 255,},
-  {132, 177, 241, 174, 191, 254, 209, 255, 255, 255,},
-  {133, 178, 242, 175, 191, 254, 210, 255, 255, 255,},
-  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},
-  {135, 180, 243, 176, 192, 254, 212, 255, 255, 255,},
-  {136, 180, 243, 177, 193, 254, 212, 255, 255, 255,},
-  {137, 181, 243, 178, 193, 254, 213, 255, 255, 255,},
-  {138, 182, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},
-  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},
-  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},
-  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},
-  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {145, 189, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {146, 190, 246, 184, 198, 255, 219, 255, 255, 255,},
-  {147, 191, 247, 185, 198, 255, 220, 255, 255, 255,},
-  {148, 191, 247, 186, 199, 255, 220, 255, 255, 255,},
-  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},
-  {150, 193, 248, 187, 200, 255, 221, 255, 255, 255,},
-  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},
-  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},
-  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},
-  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},
-  {155, 198, 249, 191, 202, 255, 224, 255, 255, 255,},
-  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},
-  {157, 199, 249, 192, 203, 255, 226, 255, 255, 255,},
-  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},
-  {159, 201, 250, 193, 204, 255, 227, 255, 255, 255,},
-  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},
-  {161, 203, 250, 195, 206, 255, 228, 255, 255, 255,},
-  {162, 203, 250, 196, 206, 255, 228, 255, 255, 255,},
-  {163, 204, 251, 196, 207, 255, 229, 255, 255, 255,},
-  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},
-  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},
-  {166, 207, 251, 198, 208, 255, 231, 255, 255, 255,},
-  {167, 207, 251, 199, 209, 255, 231, 255, 255, 255,},
-  {168, 208, 252, 200, 209, 255, 232, 255, 255, 255,},
-  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},
-  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},
-  {171, 211, 252, 202, 211, 255, 233, 255, 255, 255,},
-  {172, 211, 252, 203, 211, 255, 234, 255, 255, 255,},
-  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},
-  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},
-  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},
-  {176, 214, 253, 206, 213, 255, 236, 255, 255, 255,},
-  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},
-  {178, 216, 253, 207, 214, 255, 237, 255, 255, 255,},
-  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},
-  {180, 217, 253, 208, 216, 255, 237, 255, 255, 255,},
-  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},
-  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},
-  {183, 220, 254, 211, 217, 255, 239, 255, 255, 255,},
-  {184, 220, 254, 211, 218, 255, 239, 255, 255, 255,},
-  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},
-  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},
-  {187, 222, 254, 213, 219, 255, 241, 255, 255, 255,},
-  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},
-  {189, 224, 254, 215, 220, 255, 241, 255, 255, 255,},
-  {190, 225, 254, 215, 221, 255, 242, 255, 255, 255,},
-  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},
-  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},
-  {193, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},
-  {196, 229, 255, 220, 224, 255, 244, 255, 255, 255,},
-  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},
-  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},
-  {199, 230, 255, 222, 226, 255, 245, 255, 255, 255,},
-  {200, 231, 255, 222, 226, 255, 246, 255, 255, 255,},
-  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},
-  {202, 232, 255, 224, 228, 255, 246, 255, 255, 255,},
-  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},
-  {204, 234, 255, 225, 229, 255, 247, 255, 255, 255,},
-  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},
-  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {207, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},
-  {209, 237, 255, 229, 231, 255, 248, 255, 255, 255,},
-  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},
-  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},
-  {212, 238, 255, 231, 233, 255, 249, 255, 255, 255,},
-  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},
-  {214, 239, 255, 232, 234, 255, 250, 255, 255, 255,},
-  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {216, 241, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},
-  {218, 242, 255, 235, 236, 255, 251, 255, 255, 255,},
-  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},
-  {220, 243, 255, 236, 237, 255, 251, 255, 255, 255,},
-  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},
-  {222, 244, 255, 237, 239, 255, 252, 255, 255, 255,},
-  {223, 244, 255, 238, 239, 255, 252, 255, 255, 255,},
-  {224, 245, 255, 238, 240, 255, 252, 255, 255, 255,},
-  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},
-  {226, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {228, 247, 255, 241, 242, 255, 253, 255, 255, 255,},
-  {229, 247, 255, 242, 242, 255, 253, 255, 255, 255,},
-  {230, 248, 255, 242, 243, 255, 253, 255, 255, 255,},
-  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {232, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},
-  {234, 249, 255, 245, 245, 255, 254, 255, 255, 255,},
-  {235, 250, 255, 245, 246, 255, 254, 255, 255, 255,},
-  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},
-  {237, 251, 255, 246, 247, 255, 255, 255, 255, 255,},
-  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {239, 251, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {240, 252, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {242, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {243, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {245, 253, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {246, 254, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {248, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {249, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {250, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {252, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,}
-};
-
-const vp9_prob vp9_modelcoefprobs_gg625[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  88, 130,  10,  88,  35,  94,  40,},
-  {2,   4,  11,  89, 131,  19,  90,  60,  99,  67,},
-  {3,   6,  15,  90, 132,  27,  92,  80, 103,  88,},
-  {4,   7,  20,  91, 132,  34,  93,  97, 107, 105,},
-  {5,   9,  24,  92, 133,  41,  94, 112, 110, 120,},
-  {6,  11,  28,  93, 134,  48,  95, 125, 113, 132,},
-  {7,  13,  33,  93, 134,  54,  97, 136, 116, 143,},
-  {8,  14,  36,  94, 135,  60,  98, 146, 119, 152,},
-  {9,  16,  40,  95, 135,  65,  99, 155, 122, 161,},
-  {10,  18,  44,  95, 136,  70, 100, 163, 125, 168,},
-  {11,  19,  48,  96, 136,  75, 101, 170, 127, 175,},
-  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},
-  {13,  23,  55,  97, 137,  85, 102, 182, 132, 187,},
-  {14,  24,  58,  98, 138,  89, 103, 188, 135, 192,},
-  {15,  26,  61,  99, 138,  94, 104, 193, 137, 196,},
-  {16,  27,  64,  99, 139,  98, 105, 197, 140, 201,},
-  {17,  29,  67, 100, 139, 102, 106, 201, 142, 205,},
-  {18,  30,  70, 101, 140, 106, 107, 205, 144, 208,},
-  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},
-  {20,  34,  76, 102, 140, 113, 109, 212, 148, 214,},
-  {21,  35,  79, 102, 141, 116, 109, 215, 151, 217,},
-  {22,  37,  82, 103, 141, 120, 110, 218, 153, 220,},
-  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},
-  {24,  40,  87, 104, 142, 126, 112, 223, 157, 224,},
-  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},
-  {26,  42,  93, 105, 143, 132, 113, 227, 161, 228,},
-  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},
-  {28,  45,  98, 106, 144, 138, 115, 230, 164, 232,},
-  {29,  47, 100, 107, 144, 141, 116, 232, 166, 233,},
-  {30,  48, 103, 107, 145, 144, 117, 234, 168, 235,},
-  {31,  50, 105, 108, 145, 146, 117, 235, 170, 236,},
-  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},
-  {33,  52, 110, 109, 146, 151, 119, 238, 173, 238,},
-  {34,  54, 112, 110, 146, 154, 120, 239, 175, 239,},
-  {35,  55, 114, 110, 147, 156, 120, 240, 176, 240,},
-  {36,  57, 116, 111, 147, 158, 121, 241, 178, 241,},
-  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},
-  {38,  59, 121, 112, 148, 163, 123, 243, 181, 243,},
-  {39,  61, 123, 112, 148, 165, 123, 244, 183, 244,},
-  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},
-  {41,  63, 127, 113, 149, 169, 125, 245, 186, 245,},
-  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},
-  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},
-  {44,  67, 133, 115, 150, 175, 127, 247, 190, 247,},
-  {45,  69, 135, 115, 150, 177, 128, 247, 191, 248,},
-  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},
-  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},
-  {48,  73, 140, 117, 151, 182, 130, 249, 195, 249,},
-  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},
-  {50,  75, 144, 118, 152, 185, 131, 250, 198, 250,},
-  {51,  76, 145, 119, 153, 187, 132, 250, 199, 250,},
-  {52,  78, 147, 119, 153, 188, 133, 251, 200, 251,},
-  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},
-  {54,  80, 151, 120, 154, 192, 134, 251, 203, 251,},
-  {55,  82, 152, 121, 154, 193, 135, 251, 204, 252,},
-  {56,  83, 154, 121, 154, 194, 136, 252, 205, 252,},
-  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},
-  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},
-  {59,  86, 158, 123, 156, 199, 138, 252, 208, 252,},
-  {60,  88, 160, 123, 156, 200, 139, 253, 209, 253,},
-  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},
-  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},
-  {63,  91, 164, 125, 157, 204, 141, 253, 212, 253,},
-  {64,  93, 166, 125, 157, 205, 141, 253, 213, 253,},
-  {65,  94, 167, 126, 158, 206, 142, 254, 214, 254,},
-  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},
-  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},
-  {68,  97, 172, 127, 159, 209, 144, 254, 217, 254,},
-  {69,  98, 173, 128, 159, 210, 145, 254, 218, 254,},
-  {70, 100, 174, 128, 160, 212, 146, 254, 219, 254,},
-  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},
-  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},
-  {73, 103, 178, 130, 161, 215, 148, 255, 221, 255,},
-  {74, 104, 179, 131, 161, 216, 148, 255, 222, 255,},
-  {75, 105, 181, 131, 161, 217, 149, 255, 223, 255,},
-  {76, 107, 182, 132, 162, 217, 150, 255, 224, 255,},
-  {77, 108, 183, 132, 162, 218, 150, 255, 224, 255,},
-  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},
-  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},
-  {80, 111, 187, 134, 163, 221, 153, 255, 227, 255,},
-  {81, 112, 188, 134, 164, 222, 153, 255, 227, 255,},
-  {82, 113, 189, 135, 164, 223, 154, 255, 228, 255,},
-  {83, 115, 190, 135, 164, 223, 155, 255, 229, 255,},
-  {84, 116, 191, 136, 165, 224, 155, 255, 229, 255,},
-  {85, 117, 192, 136, 165, 225, 156, 255, 230, 255,},
-  {86, 118, 193, 137, 165, 226, 157, 255, 231, 255,},
-  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},
-  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},
-  {89, 121, 196, 139, 167, 228, 159, 255, 232, 255,},
-  {90, 122, 197, 139, 167, 229, 159, 255, 233, 255,},
-  {91, 123, 198, 140, 167, 229, 160, 255, 234, 255,},
-  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 125, 200, 141, 168, 231, 162, 255, 235, 255,},
-  {94, 127, 201, 141, 168, 231, 162, 255, 235, 255,},
-  {95, 128, 202, 142, 169, 232, 163, 255, 236, 255,},
-  {96, 129, 203, 142, 169, 232, 164, 255, 236, 255,},
-  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},
-  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},
-  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},
-  {100, 133, 207, 144, 171, 235, 166, 255, 238, 255,},
-  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},
-  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},
-  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},
-  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},
-  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},
-  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},
-  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},
-  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},
-  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},
-  {110, 143, 215, 150, 175, 240, 173, 255, 242, 255,},
-  {111, 144, 216, 151, 175, 240, 174, 255, 243, 255,},
-  {112, 145, 217, 151, 175, 240, 174, 255, 243, 255,},
-  {113, 146, 217, 152, 176, 241, 175, 255, 244, 255,},
-  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},
-  {115, 148, 219, 153, 176, 242, 177, 255, 244, 255,},
-  {116, 149, 219, 153, 177, 242, 177, 255, 245, 255,},
-  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},
-  {118, 151, 221, 155, 178, 243, 179, 255, 245, 255,},
-  {119, 152, 222, 155, 178, 243, 179, 255, 245, 255,},
-  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},
-  {121, 154, 223, 156, 179, 244, 181, 255, 246, 255,},
-  {122, 155, 224, 157, 179, 244, 181, 255, 246, 255,},
-  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},
-  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},
-  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},
-  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},
-  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},
-  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},
-  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},
-  {130, 163, 228, 161, 182, 247, 187, 255, 248, 255,},
-  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},
-  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},
-  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},
-  {134, 166, 231, 164, 184, 248, 189, 255, 249, 255,},
-  {135, 167, 231, 164, 184, 248, 190, 255, 250, 255,},
-  {136, 168, 232, 165, 185, 248, 191, 255, 250, 255,},
-  {137, 169, 232, 166, 185, 248, 191, 255, 250, 255,},
-  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},
-  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},
-  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},
-  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},
-  {142, 174, 235, 169, 187, 250, 194, 255, 251, 255,},
-  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},
-  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},
-  {145, 177, 236, 170, 189, 250, 196, 255, 251, 255,},
-  {146, 177, 237, 171, 189, 250, 197, 255, 252, 255,},
-  {147, 178, 237, 172, 189, 251, 198, 255, 252, 255,},
-  {148, 179, 238, 172, 190, 251, 198, 255, 252, 255,},
-  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},
-  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},
-  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},
-  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},
-  {153, 184, 240, 175, 192, 252, 202, 255, 252, 255,},
-  {154, 184, 240, 176, 193, 252, 202, 255, 253, 255,},
-  {155, 185, 240, 177, 193, 252, 203, 255, 253, 255,},
-  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},
-  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},
-  {158, 188, 242, 178, 194, 252, 205, 255, 253, 255,},
-  {159, 189, 242, 179, 195, 252, 205, 255, 253, 255,},
-  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},
-  {161, 190, 243, 180, 196, 253, 207, 255, 253, 255,},
-  {162, 191, 243, 181, 196, 253, 207, 255, 254, 255,},
-  {163, 192, 243, 182, 197, 253, 208, 255, 254, 255,},
-  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},
-  {165, 194, 244, 183, 197, 253, 209, 255, 254, 255,},
-  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},
-  {167, 196, 245, 184, 198, 253, 210, 255, 254, 255,},
-  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},
-  {169, 197, 245, 186, 199, 254, 212, 255, 254, 255,},
-  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},
-  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},
-  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {173, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {174, 201, 247, 189, 202, 254, 215, 255, 254, 255,},
-  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},
-  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},
-  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},
-  {178, 204, 248, 191, 204, 254, 217, 255, 255, 255,},
-  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},
-  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},
-  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},
-  {182, 208, 249, 194, 205, 255, 220, 255, 255, 255,},
-  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},
-  {184, 209, 249, 196, 206, 255, 221, 255, 255, 255,},
-  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},
-  {186, 211, 250, 197, 207, 255, 222, 255, 255, 255,},
-  {187, 211, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},
-  {190, 214, 251, 200, 209, 255, 224, 255, 255, 255,},
-  {191, 215, 251, 200, 210, 255, 225, 255, 255, 255,},
-  {192, 215, 251, 201, 211, 255, 225, 255, 255, 255,},
-  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},
-  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},
-  {195, 218, 252, 203, 212, 255, 227, 255, 255, 255,},
-  {196, 218, 252, 204, 213, 255, 228, 255, 255, 255,},
-  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},
-  {198, 220, 252, 205, 214, 255, 229, 255, 255, 255,},
-  {199, 221, 252, 206, 214, 255, 229, 255, 255, 255,},
-  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},
-  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},
-  {202, 223, 253, 208, 216, 255, 231, 255, 255, 255,},
-  {203, 223, 253, 209, 216, 255, 232, 255, 255, 255,},
-  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},
-  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {206, 226, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {207, 226, 253, 212, 219, 255, 234, 255, 255, 255,},
-  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},
-  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},
-  {210, 228, 254, 214, 220, 255, 236, 255, 255, 255,},
-  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},
-  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},
-  {213, 230, 254, 217, 222, 255, 237, 255, 255, 255,},
-  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},
-  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},
-  {216, 233, 254, 219, 224, 255, 239, 255, 255, 255,},
-  {217, 233, 254, 220, 225, 255, 239, 255, 255, 255,},
-  {218, 234, 255, 220, 225, 255, 240, 255, 255, 255,},
-  {219, 235, 255, 221, 226, 255, 240, 255, 255, 255,},
-  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},
-  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},
-  {222, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},
-  {225, 238, 255, 226, 230, 255, 243, 255, 255, 255,},
-  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},
-  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},
-  {228, 240, 255, 228, 232, 255, 245, 255, 255, 255,},
-  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},
-  {230, 242, 255, 230, 233, 255, 246, 255, 255, 255,},
-  {231, 242, 255, 231, 234, 255, 246, 255, 255, 255,},
-  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},
-  {233, 243, 255, 233, 235, 255, 247, 255, 255, 255,},
-  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},
-  {235, 245, 255, 234, 236, 255, 248, 255, 255, 255,},
-  {236, 245, 255, 235, 237, 255, 248, 255, 255, 255,},
-  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},
-  {238, 247, 255, 237, 239, 255, 249, 255, 255, 255,},
-  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},
-  {240, 248, 255, 239, 240, 255, 250, 255, 255, 255,},
-  {241, 248, 255, 240, 241, 255, 251, 255, 255, 255,},
-  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},
-  {243, 249, 255, 241, 243, 255, 251, 255, 255, 255,},
-  {244, 250, 255, 242, 243, 255, 252, 255, 255, 255,},
-  {245, 251, 255, 243, 244, 255, 252, 255, 255, 255,},
-  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},
-  {247, 252, 255, 245, 246, 255, 253, 255, 255, 255,},
-  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},
-  {249, 253, 255, 247, 248, 255, 254, 255, 255, 255,},
-  {250, 253, 255, 248, 249, 255, 254, 255, 255, 255,},
-  {251, 254, 255, 249, 250, 255, 254, 255, 255, 255,},
-  {252, 254, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
+// beta = 8
+const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
+  {  3,  86, 128,   6,  86,  23,  88,  29},
+  {  9,  86, 129,  17,  88,  61,  94,  76},
+  { 15,  87, 129,  28,  89,  93, 100, 110},
+  { 20,  88, 130,  38,  91, 118, 106, 136},
+  { 26,  89, 131,  48,  92, 139, 111, 156},
+  { 31,  90, 131,  58,  94, 156, 117, 171},
+  { 37,  90, 132,  66,  95, 171, 122, 184},
+  { 42,  91, 132,  75,  97, 183, 127, 194},
+  { 47,  92, 133,  83,  98, 193, 132, 202},
+  { 52,  93, 133,  90, 100, 201, 137, 208},
+  { 57,  94, 134,  98, 101, 208, 142, 214},
+  { 62,  94, 135, 105, 103, 214, 146, 218},
+  { 66,  95, 135, 111, 104, 219, 151, 222},
+  { 71,  96, 136, 117, 106, 224, 155, 225},
+  { 76,  97, 136, 123, 107, 227, 159, 228},
+  { 80,  98, 137, 129, 109, 231, 162, 231},
+  { 84,  98, 138, 134, 110, 234, 166, 233},
+  { 89,  99, 138, 140, 112, 236, 170, 235},
+  { 93, 100, 139, 145, 113, 238, 173, 236},
+  { 97, 101, 140, 149, 115, 240, 176, 238},
+  {101, 102, 140, 154, 116, 242, 179, 239},
+  {105, 103, 141, 158, 118, 243, 182, 240},
+  {109, 104, 141, 162, 119, 244, 185, 241},
+  {113, 104, 142, 166, 120, 245, 187, 242},
+  {116, 105, 143, 170, 122, 246, 190, 243},
+  {120, 106, 143, 173, 123, 247, 192, 244},
+  {123, 107, 144, 177, 125, 248, 195, 244},
+  {127, 108, 145, 180, 126, 249, 197, 245},
+  {130, 109, 145, 183, 128, 249, 199, 245},
+  {134, 110, 146, 186, 129, 250, 201, 246},
+  {137, 111, 147, 189, 131, 251, 203, 246},
+  {140, 112, 147, 192, 132, 251, 205, 247},
+  {143, 113, 148, 194, 133, 251, 207, 247},
+  {146, 114, 149, 197, 135, 252, 208, 248},
+  {149, 115, 149, 199, 136, 252, 210, 248},
+  {152, 115, 150, 201, 138, 252, 211, 248},
+  {155, 116, 151, 204, 139, 253, 213, 249},
+  {158, 117, 151, 206, 140, 253, 214, 249},
+  {161, 118, 152, 208, 142, 253, 216, 249},
+  {163, 119, 153, 210, 143, 253, 217, 249},
+  {166, 120, 153, 212, 144, 254, 218, 250},
+  {168, 121, 154, 213, 146, 254, 220, 250},
+  {171, 122, 155, 215, 147, 254, 221, 250},
+  {173, 123, 155, 217, 148, 254, 222, 250},
+  {176, 124, 156, 218, 150, 254, 223, 250},
+  {178, 125, 157, 220, 151, 254, 224, 251},
+  {180, 126, 157, 221, 152, 254, 225, 251},
+  {183, 127, 158, 222, 153, 254, 226, 251},
+  {185, 128, 159, 224, 155, 255, 227, 251},
+  {187, 129, 160, 225, 156, 255, 228, 251},
+  {189, 131, 160, 226, 157, 255, 228, 251},
+  {191, 132, 161, 227, 159, 255, 229, 251},
+  {193, 133, 162, 228, 160, 255, 230, 252},
+  {195, 134, 163, 230, 161, 255, 231, 252},
+  {197, 135, 163, 231, 162, 255, 231, 252},
+  {199, 136, 164, 232, 163, 255, 232, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {202, 138, 166, 233, 166, 255, 233, 252},
+  {204, 139, 166, 234, 167, 255, 234, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {207, 141, 168, 236, 169, 255, 235, 252},
+  {209, 142, 169, 237, 171, 255, 236, 252},
+  {210, 144, 169, 237, 172, 255, 236, 252},
+  {212, 145, 170, 238, 173, 255, 237, 252},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {216, 148, 173, 240, 176, 255, 238, 253},
+  {218, 149, 173, 241, 177, 255, 239, 253},
+  {219, 150, 174, 241, 179, 255, 239, 253},
+  {220, 152, 175, 242, 180, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {225, 156, 178, 244, 184, 255, 241, 253},
+  {226, 158, 179, 244, 185, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {230, 161, 182, 246, 188, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {234, 166, 185, 247, 192, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {237, 171, 189, 249, 196, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {240, 175, 192, 249, 199, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {242, 179, 195, 250, 202, 255, 246, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {243, 182, 197, 251, 204, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {245, 186, 200, 251, 207, 255, 247, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 189, 202, 252, 208, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 192, 204, 252, 210, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 195, 206, 252, 212, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 198, 208, 253, 214, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 201, 211, 253, 215, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {251, 204, 213, 253, 217, 255, 250, 254},
+  {251, 206, 214, 254, 218, 255, 250, 254},
+  {252, 207, 216, 254, 219, 255, 250, 254},
+  {252, 209, 217, 254, 220, 255, 250, 254},
+  {252, 211, 218, 254, 221, 255, 250, 254},
+  {253, 213, 219, 254, 222, 255, 250, 254},
+  {253, 214, 221, 254, 223, 255, 250, 254},
+  {253, 216, 222, 254, 224, 255, 251, 254},
+  {253, 218, 224, 254, 225, 255, 251, 254},
+  {254, 220, 225, 254, 225, 255, 251, 254},
+  {254, 222, 227, 255, 226, 255, 251, 254},
+  {254, 224, 228, 255, 227, 255, 251, 254},
+  {254, 226, 230, 255, 228, 255, 251, 254},
+  {255, 228, 231, 255, 230, 255, 251, 254},
+  {255, 230, 233, 255, 231, 255, 252, 254},
+  {255, 232, 235, 255, 232, 255, 252, 254},
+  {255, 235, 237, 255, 233, 255, 252, 254},
+  {255, 238, 240, 255, 235, 255, 252, 255},
+  {255, 241, 243, 255, 236, 255, 252, 254},
+  {255, 246, 247, 255, 239, 255, 253, 255}
 };
 
-const vp9_prob vp9_modelcoefprobs_gg875p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  86, 128,   6,  86,  22,  89,  28,},
-  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},
-  {2,   3,   9,  87, 129,  17,  88,  59,  94,  73,},
-  {2,   4,  12,  87, 129,  22,  89,  75,  97,  92,},
-  {3,   5,  14,  88, 130,  27,  89,  90, 100, 108,},
-  {3,   6,  17,  88, 130,  33,  90, 103, 102, 122,},
-  {4,   7,  20,  88, 130,  37,  91, 115, 105, 135,},
-  {4,   8,  23,  89, 131,  42,  92, 126, 108, 147,},
-  {5,   9,  25,  89, 131,  47,  92, 137, 110, 157,},
-  {5,  10,  28,  90, 131,  52,  93, 146, 113, 167,},
-  {6,  11,  31,  90, 132,  56,  94, 154, 115, 175,},
-  {6,  12,  33,  90, 132,  60,  94, 162, 118, 183,},
-  {7,  13,  36,  91, 132,  65,  95, 170, 120, 190,},
-  {7,  14,  39,  91, 132,  69,  96, 176, 123, 196,},
-  {8,  15,  41,  92, 133,  73,  96, 182, 125, 201,},
-  {8,  16,  44,  92, 133,  77,  97, 188, 128, 206,},
-  {9,  17,  46,  92, 133,  81,  98, 193, 130, 211,},
-  {9,  18,  49,  93, 134,  85,  99, 198, 133, 215,},
-  {10,  19,  51,  93, 134,  89,  99, 203, 135, 219,},
-  {10,  20,  54,  93, 134,  92, 100, 207, 137, 222,},
-  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},
-  {12,  22,  58,  94, 135, 100, 101, 214, 142, 228,},
-  {12,  23,  61,  95, 135, 103, 102, 217, 145, 231,},
-  {13,  24,  63,  95, 135, 106, 103, 220, 147, 233,},
-  {13,  25,  66,  95, 136, 110, 103, 223, 149, 235,},
-  {14,  26,  68,  96, 136, 113, 104, 226, 151, 237,},
-  {14,  27,  70,  96, 136, 116, 105, 228, 154, 239,},
-  {15,  28,  72,  97, 136, 119, 106, 230, 156, 241,},
-  {15,  29,  75,  97, 137, 122, 106, 232, 158, 242,},
-  {16,  30,  77,  97, 137, 125, 107, 234, 160, 243,},
-  {17,  31,  79,  98, 137, 128, 108, 236, 163, 245,},
-  {17,  32,  81,  98, 138, 131, 108, 237, 165, 246,},
-  {18,  33,  83,  99, 138, 134, 109, 239, 167, 247,},
-  {18,  34,  86,  99, 138, 137, 110, 240, 169, 248,},
-  {19,  35,  88,  99, 138, 140, 111, 242, 171, 248,},
-  {19,  36,  90, 100, 139, 142, 111, 243, 173, 249,},
-  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},
-  {20,  38,  94, 101, 139, 148, 113, 245, 177, 250,},
-  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},
-  {22,  40,  98, 101, 140, 153, 114, 246, 181, 251,},
-  {22,  41, 100, 102, 140, 155, 115, 247, 183, 252,},
-  {23,  42, 102, 102, 140, 157, 116, 248, 185, 252,},
-  {23,  43, 104, 103, 141, 160, 116, 249, 186, 253,},
-  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},
-  {25,  45, 108, 103, 141, 164, 118, 250, 190, 253,},
-  {25,  46, 110, 104, 142, 166, 119, 250, 192, 253,},
-  {26,  47, 112, 104, 142, 168, 119, 251, 193, 254,},
-  {26,  48, 114, 105, 142, 171, 120, 251, 195, 254,},
-  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},
-  {27,  50, 118, 105, 143, 175, 122, 252, 198, 254,},
-  {28,  51, 119, 106, 143, 177, 122, 252, 200, 254,},
-  {29,  52, 121, 106, 143, 179, 123, 253, 201, 255,},
-  {29,  53, 123, 107, 144, 180, 124, 253, 203, 255,},
-  {30,  54, 125, 107, 144, 182, 125, 253, 204, 255,},
-  {30,  55, 127, 108, 144, 184, 125, 253, 206, 255,},
-  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},
-  {32,  57, 130, 108, 145, 188, 127, 254, 209, 255,},
-  {32,  58, 132, 109, 145, 189, 128, 254, 210, 255,},
-  {33,  59, 134, 109, 146, 191, 128, 254, 211, 255,},
-  {33,  60, 135, 110, 146, 193, 129, 254, 213, 255,},
-  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},
-  {35,  62, 139, 111, 146, 196, 131, 255, 215, 255,},
-  {35,  63, 140, 111, 147, 197, 131, 255, 216, 255,},
-  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},
-  {37,  65, 144, 112, 147, 200, 133, 255, 219, 255,},
-  {37,  66, 145, 112, 148, 202, 134, 255, 220, 255,},
-  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},
-  {38,  68, 148, 113, 148, 204, 135, 255, 222, 255,},
-  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},
-  {40,  70, 151, 114, 149, 207, 137, 255, 224, 255,},
-  {40,  71, 153, 115, 149, 208, 138, 255, 225, 255,},
-  {41,  72, 154, 115, 150, 210, 138, 255, 226, 255,},
-  {42,  73, 156, 116, 150, 211, 139, 255, 227, 255,},
-  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},
-  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},
-  {44,  76, 160, 117, 151, 214, 142, 255, 230, 255,},
-  {44,  77, 162, 117, 151, 216, 142, 255, 231, 255,},
-  {45,  78, 163, 118, 152, 217, 143, 255, 231, 255,},
-  {45,  79, 165, 118, 152, 218, 144, 255, 232, 255,},
-  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},
-  {47,  81, 167, 119, 153, 220, 146, 255, 234, 255,},
-  {47,  82, 169, 120, 153, 221, 146, 255, 235, 255,},
-  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},
-  {49,  84, 171, 121, 154, 223, 148, 255, 236, 255,},
-  {49,  85, 173, 121, 154, 224, 149, 255, 237, 255,},
-  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},
-  {51,  87, 175, 122, 155, 225, 150, 255, 238, 255,},
-  {51,  88, 177, 123, 155, 226, 151, 255, 239, 255,},
-  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},
-  {53,  90, 179, 124, 156, 228, 153, 255, 240, 255,},
-  {53,  91, 180, 124, 156, 229, 154, 255, 240, 255,},
-  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},
-  {55,  93, 183, 125, 157, 230, 155, 255, 241, 255,},
-  {55,  94, 184, 126, 157, 231, 156, 255, 242, 255,},
-  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},
-  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},
-  {57,  97, 188, 127, 158, 233, 159, 255, 243, 255,},
-  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},
-  {59,  99, 190, 128, 159, 235, 160, 255, 244, 255,},
-  {60, 100, 191, 129, 159, 235, 161, 255, 245, 255,},
-  {60, 101, 192, 129, 160, 236, 162, 255, 245, 255,},
-  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},
-  {62, 103, 194, 131, 160, 237, 164, 255, 246, 255,},
-  {62, 104, 196, 131, 161, 238, 164, 255, 246, 255,},
-  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},
-  {64, 106, 198, 132, 161, 239, 166, 255, 247, 255,},
-  {64, 107, 199, 133, 162, 239, 167, 255, 247, 255,},
-  {65, 108, 200, 133, 162, 240, 168, 255, 248, 255,},
-  {66, 109, 201, 134, 163, 241, 168, 255, 248, 255,},
-  {67, 110, 202, 134, 163, 241, 169, 255, 248, 255,},
-  {67, 111, 203, 135, 163, 242, 170, 255, 249, 255,},
-  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},
-  {69, 113, 205, 136, 164, 242, 172, 255, 249, 255,},
-  {69, 114, 206, 137, 164, 243, 173, 255, 250, 255,},
-  {70, 115, 207, 137, 165, 243, 173, 255, 250, 255,},
-  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},
-  {72, 117, 208, 138, 166, 244, 175, 255, 250, 255,},
-  {72, 118, 209, 139, 166, 245, 176, 255, 251, 255,},
-  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},
-  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},
-  {75, 121, 212, 141, 167, 246, 178, 255, 251, 255,},
-  {75, 122, 213, 141, 168, 246, 179, 255, 251, 255,},
-  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},
-  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},
-  {78, 125, 215, 143, 169, 247, 182, 255, 252, 255,},
-  {78, 126, 216, 144, 169, 247, 182, 255, 252, 255,},
-  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},
-  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},
-  {81, 129, 219, 145, 170, 248, 185, 255, 253, 255,},
-  {82, 130, 219, 146, 171, 249, 186, 255, 253, 255,},
-  {82, 131, 220, 147, 171, 249, 187, 255, 253, 255,},
-  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},
-  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},
-  {85, 134, 222, 148, 173, 250, 189, 255, 253, 255,},
-  {85, 135, 223, 149, 173, 250, 190, 255, 254, 255,},
-  {86, 136, 224, 150, 173, 250, 191, 255, 254, 255,},
-  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},
-  {88, 138, 225, 151, 174, 251, 192, 255, 254, 255,},
-  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},
-  {89, 140, 227, 152, 175, 251, 194, 255, 254, 255,},
-  {90, 141, 227, 153, 176, 251, 195, 255, 254, 255,},
-  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},
-  {92, 143, 229, 154, 176, 252, 196, 255, 254, 255,},
-  {93, 144, 229, 155, 177, 252, 197, 255, 254, 255,},
-  {93, 145, 230, 155, 177, 252, 198, 255, 255, 255,},
-  {94, 146, 231, 156, 178, 252, 199, 255, 255, 255,},
-  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},
-  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},
-  {97, 149, 232, 158, 179, 253, 201, 255, 255, 255,},
-  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 151, 234, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 152, 234, 160, 181, 253, 203, 255, 255, 255,},
-  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},
-  {101, 154, 235, 162, 182, 253, 205, 255, 255, 255,},
-  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},
-  {103, 156, 236, 163, 183, 254, 206, 255, 255, 255,},
-  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},
-  {105, 158, 237, 164, 183, 254, 208, 255, 255, 255,},
-  {105, 159, 238, 165, 184, 254, 209, 255, 255, 255,},
-  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},
-  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},
-  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},
-  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},
-  {110, 164, 240, 169, 186, 254, 212, 255, 255, 255,},
-  {111, 165, 241, 169, 187, 254, 213, 255, 255, 255,},
-  {112, 166, 241, 170, 187, 255, 214, 255, 255, 255,},
-  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},
-  {114, 168, 242, 172, 189, 255, 215, 255, 255, 255,},
-  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},
-  {115, 170, 243, 173, 190, 255, 217, 255, 255, 255,},
-  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},
-  {117, 172, 244, 175, 191, 255, 218, 255, 255, 255,},
-  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},
-  {119, 174, 244, 176, 192, 255, 220, 255, 255, 255,},
-  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},
-  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},
-  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},
-  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},
-  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},
-  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},
-  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},
-  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},
-  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},
-  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},
-  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},
-  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},
-  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},
-  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},
-  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},
-  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},
-  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},
-  {137, 192, 250, 191, 202, 255, 231, 255, 255, 255,},
-  {138, 193, 250, 191, 203, 255, 232, 255, 255, 255,},
-  {139, 194, 250, 192, 203, 255, 233, 255, 255, 255,},
-  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},
-  {142, 196, 251, 194, 204, 255, 234, 255, 255, 255,},
-  {143, 197, 251, 195, 205, 255, 234, 255, 255, 255,},
-  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},
-  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},
-  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},
-  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},
-  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},
-  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},
-  {151, 204, 253, 201, 210, 255, 238, 255, 255, 255,},
-  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},
-  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},
-  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {155, 208, 253, 205, 212, 255, 241, 255, 255, 255,},
-  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},
-  {158, 210, 253, 206, 214, 255, 242, 255, 255, 255,},
-  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {160, 212, 254, 208, 215, 255, 243, 255, 255, 255,},
-  {162, 213, 254, 209, 216, 255, 243, 255, 255, 255,},
-  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},
-  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},
-  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},
-  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},
-  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},
-  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},
-  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},
-  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},
-  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},
-  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},
-  {179, 226, 255, 222, 226, 255, 249, 255, 255, 255,},
-  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},
-  {182, 228, 255, 224, 227, 255, 250, 255, 255, 255,},
-  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},
-  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},
-  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},
-  {190, 233, 255, 229, 232, 255, 251, 255, 255, 255,},
-  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},
-  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {197, 237, 255, 233, 235, 255, 253, 255, 255, 255,},
-  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},
-  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},
-  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},
-  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},
-  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},
-  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-};
+static void extend_model_to_full_distribution(vp9_prob p,
+                                              vp9_prob *tree_probs) {
+  const int l = ((p - 1) / 2);
+  const vp9_prob (*model)[MODEL_NODES];
+  model = vp9_modelcoefprobs_pareto8;
+  if (p & 1) {
+    vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
+               model[l], MODEL_NODES * sizeof(vp9_prob));
+  } else {
+    // interpolate
+    int i;
+    for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+      tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] +
+                       model[l + 1][i - UNCONSTRAINED_NODES]) >> 1;
+  }
+}
 
-const vp9_prob vp9_modelcoefprobs_gg75p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  86, 129,   6,  87,  21,  90,  26,},
-  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},
-  {2,   3,   9,  87, 130,  16,  89,  55,  96,  65,},
-  {2,   4,  11,  88, 130,  21,  89,  69,  98,  81,},
-  {3,   5,  14,  88, 130,  26,  90,  82, 101,  95,},
-  {3,   6,  17,  89, 131,  31,  91,  94, 103, 107,},
-  {4,   7,  20,  89, 131,  35,  92, 105, 105, 119,},
-  {4,   8,  22,  90, 131,  40,  92, 115, 108, 129,},
-  {5,   9,  25,  90, 132,  44,  93, 124, 110, 138,},
-  {5,  10,  27,  91, 132,  48,  94, 133, 112, 147,},
-  {6,  11,  30,  91, 132,  52,  95, 141, 114, 155,},
-  {6,  12,  32,  92, 133,  56,  95, 148, 116, 162,},
-  {7,  13,  35,  92, 133,  60,  96, 155, 118, 168,},
-  {7,  14,  37,  92, 133,  64,  97, 161, 121, 174,},
-  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},
-  {9,  16,  42,  93, 134,  71,  98, 173, 125, 185,},
-  {9,  17,  44,  94, 134,  75,  99, 178, 127, 190,},
-  {10,  18,  47,  94, 135,  78,  99, 182, 129, 195,},
-  {10,  19,  49,  94, 135,  82, 100, 187, 131, 199,},
-  {11,  20,  51,  95, 135,  85, 100, 191, 133, 202,},
-  {11,  21,  54,  95, 135,  88, 101, 195, 135, 206,},
-  {12,  22,  56,  96, 136,  92, 102, 199, 137, 209,},
-  {13,  23,  58,  96, 136,  95, 102, 202, 138, 213,},
-  {13,  24,  61,  96, 136,  98, 103, 206, 140, 215,},
-  {14,  25,  63,  97, 137, 101, 104, 209, 142, 218,},
-  {14,  26,  65,  97, 137, 104, 104, 211, 144, 221,},
-  {15,  27,  67,  98, 137, 107, 105, 214, 146, 223,},
-  {15,  28,  69,  98, 138, 110, 106, 217, 148, 225,},
-  {16,  29,  71,  98, 138, 113, 106, 219, 150, 227,},
-  {17,  30,  73,  99, 138, 115, 107, 221, 151, 229,},
-  {17,  31,  76,  99, 138, 118, 107, 223, 153, 231,},
-  {18,  32,  78, 100, 139, 121, 108, 225, 155, 232,},
-  {18,  33,  80, 100, 139, 123, 109, 227, 157, 234,},
-  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},
-  {20,  35,  84, 101, 140, 128, 110, 231, 160, 237,},
-  {20,  36,  86, 101, 140, 131, 111, 232, 162, 238,},
-  {21,  37,  88, 102, 140, 133, 111, 234, 164, 239,},
-  {21,  38,  90, 102, 140, 136, 112, 235, 165, 240,},
-  {22,  39,  92, 102, 141, 138, 112, 236, 167, 241,},
-  {23,  40,  94, 103, 141, 140, 113, 237, 169, 242,},
-  {23,  41,  95, 103, 141, 143, 114, 238, 170, 243,},
-  {24,  42,  97, 103, 142, 145, 114, 240, 172, 244,},
-  {25,  43,  99, 104, 142, 147, 115, 241, 173, 245,},
-  {25,  44, 101, 104, 142, 149, 116, 242, 175, 246,},
-  {26,  45, 103, 105, 142, 151, 116, 242, 176, 246,},
-  {26,  46, 105, 105, 143, 153, 117, 243, 178, 247,},
-  {27,  47, 107, 105, 143, 156, 117, 244, 180, 248,},
-  {28,  48, 108, 106, 143, 158, 118, 245, 181, 248,},
-  {28,  49, 110, 106, 144, 159, 119, 245, 182, 249,},
-  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},
-  {30,  51, 114, 107, 144, 163, 120, 247, 185, 250,},
-  {30,  52, 115, 108, 144, 165, 121, 247, 187, 250,},
-  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},
-  {32,  54, 119, 108, 145, 169, 122, 248, 190, 251,},
-  {32,  55, 121, 109, 145, 171, 123, 249, 191, 251,},
-  {33,  56, 122, 109, 146, 172, 123, 249, 192, 251,},
-  {34,  57, 124, 110, 146, 174, 124, 250, 194, 252,},
-  {34,  58, 126, 110, 146, 176, 125, 250, 195, 252,},
-  {35,  59, 127, 110, 147, 177, 125, 250, 196, 252,},
-  {36,  60, 129, 111, 147, 179, 126, 251, 197, 253,},
-  {36,  61, 130, 111, 147, 181, 127, 251, 199, 253,},
-  {37,  62, 132, 112, 147, 182, 127, 251, 200, 253,},
-  {38,  63, 134, 112, 148, 184, 128, 252, 201, 253,},
-  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},
-  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},
-  {40,  66, 138, 113, 149, 188, 130, 253, 205, 254,},
-  {40,  67, 140, 114, 149, 190, 130, 253, 206, 254,},
-  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},
-  {42,  69, 143, 115, 150, 192, 132, 253, 208, 254,},
-  {42,  70, 144, 115, 150, 194, 132, 253, 209, 254,},
-  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},
-  {44,  72, 147, 116, 150, 197, 134, 254, 211, 255,},
-  {44,  73, 149, 116, 151, 198, 134, 254, 212, 255,},
-  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},
-  {46,  75, 152, 117, 151, 200, 136, 254, 214, 255,},
-  {46,  76, 153, 118, 152, 202, 136, 254, 215, 255,},
-  {47,  77, 154, 118, 152, 203, 137, 254, 216, 255,},
-  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},
-  {49,  79, 157, 119, 153, 205, 139, 255, 218, 255,},
-  {49,  80, 159, 119, 153, 206, 139, 255, 219, 255,},
-  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},
-  {51,  82, 161, 120, 154, 208, 141, 255, 221, 255,},
-  {51,  83, 163, 121, 154, 210, 141, 255, 222, 255,},
-  {52,  84, 164, 121, 154, 211, 142, 255, 223, 255,},
-  {53,  85, 165, 122, 154, 212, 143, 255, 223, 255,},
-  {54,  86, 166, 122, 155, 213, 143, 255, 224, 255,},
-  {54,  87, 168, 123, 155, 214, 144, 255, 225, 255,},
-  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},
-  {56,  89, 170, 123, 156, 216, 145, 255, 227, 255,},
-  {57,  90, 172, 124, 156, 217, 146, 255, 227, 255,},
-  {57,  91, 173, 124, 156, 218, 147, 255, 228, 255,},
-  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},
-  {59,  93, 175, 125, 157, 219, 148, 255, 230, 255,},
-  {60,  94, 176, 126, 157, 220, 149, 255, 230, 255,},
-  {60,  95, 178, 126, 158, 221, 150, 255, 231, 255,},
-  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},
-  {62,  97, 180, 127, 158, 223, 151, 255, 232, 255,},
-  {63,  98, 181, 128, 159, 224, 152, 255, 233, 255,},
-  {63,  99, 182, 128, 159, 224, 152, 255, 234, 255,},
-  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},
-  {65, 101, 184, 129, 160, 226, 154, 255, 235, 255,},
-  {66, 102, 186, 130, 160, 227, 154, 255, 235, 255,},
-  {66, 103, 187, 130, 160, 227, 155, 255, 236, 255,},
-  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},
-  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},
-  {69, 106, 190, 132, 161, 230, 157, 255, 238, 255,},
-  {69, 107, 191, 132, 162, 230, 158, 255, 238, 255,},
-  {70, 108, 192, 133, 162, 231, 159, 255, 239, 255,},
-  {71, 109, 193, 133, 163, 232, 159, 255, 239, 255,},
-  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},
-  {73, 111, 195, 134, 163, 233, 161, 255, 240, 255,},
-  {73, 112, 196, 135, 164, 233, 162, 255, 241, 255,},
-  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},
-  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},
-  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},
-  {77, 116, 200, 137, 165, 236, 165, 255, 242, 255,},
-  {77, 117, 201, 137, 165, 236, 165, 255, 243, 255,},
-  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},
-  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},
-  {80, 120, 204, 139, 166, 238, 167, 255, 244, 255,},
-  {81, 121, 205, 139, 167, 238, 168, 255, 244, 255,},
-  {82, 122, 206, 140, 167, 239, 169, 255, 245, 255,},
-  {82, 123, 206, 141, 168, 239, 170, 255, 245, 255,},
-  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},
-  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},
-  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},
-  {86, 127, 210, 143, 169, 241, 173, 255, 246, 255,},
-  {87, 128, 211, 143, 169, 242, 173, 255, 247, 255,},
-  {87, 129, 212, 144, 170, 242, 174, 255, 247, 255,},
-  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},
-  {89, 131, 213, 145, 171, 243, 176, 255, 248, 255,},
-  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},
-  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},
-  {92, 134, 216, 147, 172, 244, 178, 255, 248, 255,},
-  {93, 135, 216, 147, 172, 244, 179, 255, 249, 255,},
-  {93, 136, 217, 148, 173, 245, 179, 255, 249, 255,},
-  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},
-  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},
-  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},
-  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},
-  {98, 141, 221, 151, 175, 246, 183, 255, 250, 255,},
-  {99, 142, 222, 151, 175, 247, 184, 255, 250, 255,},
-  {100, 143, 222, 152, 175, 247, 184, 255, 251, 255,},
-  {100, 144, 223, 153, 176, 247, 185, 255, 251, 255,},
-  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},
-  {102, 146, 224, 154, 177, 248, 187, 255, 251, 255,},
-  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},
-  {104, 148, 226, 155, 178, 248, 188, 255, 252, 255,},
-  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},
-  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},
-  {107, 151, 228, 157, 179, 249, 190, 255, 252, 255,},
-  {108, 152, 228, 158, 179, 249, 191, 255, 252, 255,},
-  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},
-  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},
-  {111, 155, 230, 159, 181, 250, 193, 255, 253, 255,},
-  {111, 156, 231, 160, 181, 250, 194, 255, 253, 255,},
-  {112, 157, 231, 161, 181, 251, 195, 255, 253, 255,},
-  {113, 158, 232, 161, 182, 251, 196, 255, 253, 255,},
-  {114, 159, 233, 162, 182, 251, 196, 255, 253, 255,},
-  {115, 160, 233, 163, 183, 251, 197, 255, 253, 255,},
-  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},
-  {117, 162, 234, 164, 184, 252, 199, 255, 254, 255,},
-  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},
-  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},
-  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},
-  {121, 166, 236, 167, 186, 252, 202, 255, 254, 255,},
-  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},
-  {123, 168, 237, 168, 187, 253, 203, 255, 254, 255,},
-  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {125, 170, 238, 169, 188, 253, 205, 255, 254, 255,},
-  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},
-  {127, 172, 239, 171, 189, 253, 206, 255, 254, 255,},
-  {128, 173, 240, 172, 189, 253, 207, 255, 255, 255,},
-  {129, 174, 240, 172, 190, 253, 208, 255, 255, 255,},
-  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},
-  {131, 176, 241, 174, 191, 254, 209, 255, 255, 255,},
-  {132, 177, 242, 175, 191, 254, 210, 255, 255, 255,},
-  {133, 178, 242, 175, 192, 254, 210, 255, 255, 255,},
-  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},
-  {135, 180, 243, 177, 193, 254, 212, 255, 255, 255,},
-  {137, 181, 243, 177, 193, 254, 213, 255, 255, 255,},
-  {138, 182, 244, 178, 194, 254, 213, 255, 255, 255,},
-  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},
-  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},
-  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},
-  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},
-  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {145, 189, 246, 184, 197, 255, 218, 255, 255, 255,},
-  {146, 190, 247, 184, 198, 255, 219, 255, 255, 255,},
-  {147, 191, 247, 185, 199, 255, 220, 255, 255, 255,},
-  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},
-  {150, 193, 247, 187, 200, 255, 221, 255, 255, 255,},
-  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},
-  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},
-  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},
-  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},
-  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},
-  {157, 199, 249, 192, 203, 255, 225, 255, 255, 255,},
-  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},
-  {159, 201, 250, 193, 205, 255, 227, 255, 255, 255,},
-  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},
-  {162, 203, 250, 195, 206, 255, 228, 255, 255, 255,},
-  {163, 204, 251, 196, 206, 255, 229, 255, 255, 255,},
-  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},
-  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},
-  {166, 207, 251, 199, 208, 255, 231, 255, 255, 255,},
-  {168, 208, 251, 200, 209, 255, 231, 255, 255, 255,},
-  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},
-  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},
-  {172, 211, 252, 202, 211, 255, 233, 255, 255, 255,},
-  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},
-  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},
-  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},
-  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},
-  {178, 216, 253, 207, 215, 255, 237, 255, 255, 255,},
-  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},
-  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},
-  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},
-  {184, 220, 254, 211, 217, 255, 239, 255, 255, 255,},
-  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},
-  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},
-  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},
-  {189, 224, 254, 215, 221, 255, 241, 255, 255, 255,},
-  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},
-  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},
-  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},
-  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},
-  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},
-  {200, 231, 255, 222, 226, 255, 245, 255, 255, 255,},
-  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},
-  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},
-  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},
-  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},
-  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},
-  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},
-  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},
-  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},
-  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},
-  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},
-  {223, 244, 255, 237, 239, 255, 252, 255, 255, 255,},
-  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},
-  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {229, 247, 255, 241, 242, 255, 253, 255, 255, 255,},
-  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},
-  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},
-  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-};
+void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+  if (full != model)
+    vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  extend_model_to_full_distribution(model[PIVOT_NODE], full);
+}
 
-const vp9_prob vp9_modelcoefprobs_gg625p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  87, 129,   6,  87,  20,  91,  24,},
-  {1,   2,   6,  88, 130,  11,  89,  36,  94,  41,},
-  {2,   3,   8,  88, 130,  15,  90,  50,  97,  56,},
-  {2,   4,  11,  89, 131,  20,  90,  62,  99,  70,},
-  {3,   5,  14,  90, 131,  24,  91,  74, 102,  81,},
-  {3,   6,  16,  90, 132,  29,  92,  84, 104,  92,},
-  {4,   7,  19,  91, 132,  33,  93,  93, 106, 101,},
-  {4,   8,  21,  91, 132,  37,  93, 102, 108, 110,},
-  {5,   9,  24,  92, 133,  40,  94, 110, 110, 118,},
-  {5,  10,  26,  92, 133,  44,  95, 118, 111, 125,},
-  {6,  11,  29,  93, 134,  48,  96, 125, 113, 132,},
-  {7,  12,  31,  93, 134,  51,  96, 132, 115, 139,},
-  {7,  13,  33,  93, 134,  55,  97, 138, 117, 145,},
-  {8,  14,  36,  94, 135,  58,  97, 144, 119, 150,},
-  {8,  15,  38,  94, 135,  62,  98, 149, 120, 155,},
-  {9,  16,  40,  95, 135,  65,  99, 154, 122, 160,},
-  {10,  17,  42,  95, 136,  68,  99, 159, 124, 165,},
-  {10,  18,  45,  96, 136,  71, 100, 164, 125, 169,},
-  {11,  19,  47,  96, 136,  74, 100, 168, 127, 174,},
-  {11,  20,  49,  96, 136,  77, 101, 173, 128, 177,},
-  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},
-  {13,  22,  53,  97, 137,  83, 102, 180, 131, 185,},
-  {13,  23,  55,  98, 137,  86, 103, 184, 133, 188,},
-  {14,  24,  57,  98, 138,  89, 103, 187, 135, 191,},
-  {14,  25,  59,  98, 138,  91, 104, 190, 136, 194,},
-  {15,  26,  61,  99, 138,  94, 104, 193, 138, 197,},
-  {16,  27,  64,  99, 139,  97, 105, 196, 139, 200,},
-  {16,  28,  66, 100, 139,  99, 106, 199, 141, 202,},
-  {17,  29,  68, 100, 139, 102, 106, 201, 142, 205,},
-  {18,  30,  69, 100, 139, 104, 107, 204, 143, 207,},
-  {18,  31,  71, 101, 140, 107, 107, 206, 145, 209,},
-  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},
-  {20,  33,  75, 102, 140, 112, 108, 211, 148, 213,},
-  {20,  34,  77, 102, 141, 114, 109, 213, 149, 215,},
-  {21,  35,  79, 102, 141, 116, 109, 215, 150, 217,},
-  {22,  36,  81, 103, 141, 119, 110, 217, 152, 219,},
-  {22,  37,  83, 103, 141, 121, 110, 218, 153, 220,},
-  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},
-  {24,  39,  87, 104, 142, 125, 112, 222, 156, 224,},
-  {24,  40,  88, 104, 142, 127, 112, 223, 157, 225,},
-  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},
-  {26,  42,  92, 105, 143, 131, 113, 226, 160, 228,},
-  {26,  43,  94, 105, 143, 133, 114, 227, 161, 229,},
-  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},
-  {28,  45,  97, 106, 144, 137, 115, 230, 164, 231,},
-  {28,  46,  99, 107, 144, 139, 115, 231, 165, 232,},
-  {29,  47, 101, 107, 144, 141, 116, 232, 166, 233,},
-  {30,  48, 102, 107, 145, 143, 116, 233, 168, 234,},
-  {31,  49, 104, 108, 145, 145, 117, 234, 169, 235,},
-  {31,  50, 106, 108, 145, 147, 118, 235, 170, 236,},
-  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},
-  {33,  52, 109, 109, 146, 150, 119, 237, 172, 238,},
-  {33,  53, 111, 109, 146, 152, 119, 238, 174, 239,},
-  {34,  54, 112, 110, 146, 154, 120, 239, 175, 240,},
-  {35,  55, 114, 110, 146, 156, 120, 240, 176, 240,},
-  {36,  56, 115, 110, 147, 157, 121, 240, 177, 241,},
-  {36,  57, 117, 111, 147, 159, 121, 241, 178, 242,},
-  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},
-  {38,  59, 120, 112, 148, 162, 122, 242, 181, 243,},
-  {38,  60, 122, 112, 148, 164, 123, 243, 182, 244,},
-  {39,  61, 123, 112, 148, 165, 124, 244, 183, 244,},
-  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},
-  {41,  63, 126, 113, 149, 168, 125, 245, 185, 245,},
-  {41,  64, 128, 114, 149, 170, 125, 245, 186, 246,},
-  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},
-  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},
-  {44,  67, 132, 115, 150, 174, 127, 247, 189, 247,},
-  {44,  68, 134, 115, 150, 176, 127, 247, 191, 247,},
-  {45,  69, 135, 116, 150, 177, 128, 248, 192, 248,},
-  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},
-  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},
-  {48,  72, 139, 117, 151, 181, 130, 249, 195, 249,},
-  {48,  73, 141, 117, 152, 183, 130, 249, 196, 249,},
-  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},
-  {50,  75, 143, 118, 152, 185, 131, 250, 198, 250,},
-  {51,  76, 145, 118, 152, 186, 132, 250, 199, 250,},
-  {51,  77, 146, 119, 153, 188, 132, 250, 200, 250,},
-  {52,  78, 148, 119, 153, 189, 133, 251, 201, 251,},
-  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},
-  {54,  80, 150, 120, 154, 191, 134, 251, 202, 251,},
-  {55,  81, 151, 120, 154, 192, 135, 251, 203, 251,},
-  {55,  82, 153, 121, 154, 194, 135, 252, 204, 252,},
-  {56,  83, 154, 121, 155, 195, 136, 252, 205, 252,},
-  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},
-  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},
-  {59,  86, 158, 123, 155, 198, 138, 252, 208, 252,},
-  {59,  87, 159, 123, 156, 199, 138, 253, 209, 253,},
-  {60,  88, 160, 123, 156, 200, 139, 253, 210, 253,},
-  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},
-  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},
-  {63,  91, 164, 125, 157, 203, 140, 253, 212, 253,},
-  {64,  92, 165, 125, 157, 204, 141, 253, 213, 253,},
-  {64,  93, 166, 126, 158, 205, 142, 254, 214, 253,},
-  {65,  94, 168, 126, 158, 206, 142, 254, 214, 254,},
-  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},
-  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},
-  {68,  97, 171, 127, 159, 209, 144, 254, 217, 254,},
-  {69,  98, 172, 128, 159, 210, 145, 254, 218, 254,},
-  {69,  99, 173, 128, 159, 211, 145, 254, 218, 254,},
-  {70, 100, 175, 129, 160, 212, 146, 254, 219, 254,},
-  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},
-  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},
-  {73, 103, 178, 130, 161, 214, 148, 255, 221, 255,},
-  {74, 104, 179, 130, 161, 215, 148, 255, 222, 255,},
-  {75, 105, 180, 131, 161, 216, 149, 255, 223, 255,},
-  {75, 106, 181, 131, 162, 217, 149, 255, 223, 255,},
-  {76, 107, 182, 132, 162, 218, 150, 255, 224, 255,},
-  {77, 108, 183, 132, 162, 219, 151, 255, 225, 255,},
-  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},
-  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},
-  {80, 111, 186, 134, 163, 221, 152, 255, 226, 255,},
-  {81, 112, 187, 134, 164, 222, 153, 255, 227, 255,},
-  {82, 113, 188, 135, 164, 222, 154, 255, 228, 255,},
-  {83, 114, 189, 135, 164, 223, 154, 255, 228, 255,},
-  {83, 115, 190, 136, 165, 224, 155, 255, 229, 255,},
-  {84, 116, 191, 136, 165, 224, 156, 255, 230, 255,},
-  {85, 117, 192, 137, 165, 225, 156, 255, 230, 255,},
-  {86, 118, 193, 137, 166, 226, 157, 255, 231, 255,},
-  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},
-  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},
-  {89, 121, 196, 138, 167, 228, 159, 255, 232, 255,},
-  {90, 122, 197, 139, 167, 228, 159, 255, 233, 255,},
-  {91, 123, 198, 139, 167, 229, 160, 255, 233, 255,},
-  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 125, 200, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 126, 201, 141, 168, 231, 162, 255, 235, 255,},
-  {94, 127, 202, 141, 169, 231, 163, 255, 235, 255,},
-  {95, 128, 203, 142, 169, 232, 163, 255, 236, 255,},
-  {96, 129, 203, 142, 169, 233, 164, 255, 236, 255,},
-  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},
-  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},
-  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},
-  {100, 133, 207, 145, 171, 235, 166, 255, 238, 255,},
-  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},
-  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},
-  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},
-  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},
-  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},
-  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},
-  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},
-  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},
-  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},
-  {110, 143, 215, 150, 174, 240, 173, 255, 242, 255,},
-  {111, 144, 216, 150, 175, 240, 174, 255, 243, 255,},
-  {112, 145, 216, 151, 175, 240, 174, 255, 243, 255,},
-  {113, 146, 217, 152, 176, 241, 175, 255, 243, 255,},
-  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},
-  {115, 148, 219, 153, 176, 242, 176, 255, 244, 255,},
-  {116, 149, 219, 153, 177, 242, 177, 255, 244, 255,},
-  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},
-  {118, 151, 221, 154, 178, 243, 178, 255, 245, 255,},
-  {119, 152, 221, 155, 178, 243, 179, 255, 245, 255,},
-  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},
-  {121, 154, 223, 156, 179, 244, 180, 255, 246, 255,},
-  {122, 155, 223, 157, 179, 244, 181, 255, 246, 255,},
-  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},
-  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},
-  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},
-  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},
-  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},
-  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},
-  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},
-  {130, 163, 229, 162, 183, 247, 187, 255, 248, 255,},
-  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},
-  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},
-  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},
-  {135, 167, 231, 164, 184, 248, 190, 255, 249, 255,},
-  {136, 168, 232, 165, 185, 248, 190, 255, 250, 255,},
-  {137, 169, 232, 165, 185, 248, 191, 255, 250, 255,},
-  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},
-  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},
-  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},
-  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},
-  {142, 174, 235, 169, 187, 250, 195, 255, 251, 255,},
-  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},
-  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},
-  {146, 177, 236, 171, 189, 250, 197, 255, 251, 255,},
-  {147, 178, 237, 171, 189, 251, 197, 255, 252, 255,},
-  {148, 179, 237, 172, 190, 251, 198, 255, 252, 255,},
-  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},
-  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},
-  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},
-  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},
-  {153, 184, 240, 176, 192, 252, 202, 255, 253, 255,},
-  {155, 185, 240, 176, 193, 252, 203, 255, 253, 255,},
-  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},
-  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},
-  {158, 188, 242, 179, 194, 252, 205, 255, 253, 255,},
-  {159, 189, 242, 179, 195, 252, 206, 255, 253, 255,},
-  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},
-  {162, 191, 243, 181, 196, 253, 207, 255, 253, 255,},
-  {163, 192, 243, 182, 196, 253, 208, 255, 254, 255,},
-  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},
-  {165, 194, 244, 183, 198, 253, 209, 255, 254, 255,},
-  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},
-  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},
-  {169, 197, 245, 185, 199, 254, 212, 255, 254, 255,},
-  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},
-  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},
-  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {174, 201, 247, 189, 201, 254, 215, 255, 254, 255,},
-  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},
-  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},
-  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},
-  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},
-  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},
-  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},
-  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},
-  {184, 209, 249, 195, 206, 255, 221, 255, 255, 255,},
-  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},
-  {186, 211, 250, 197, 208, 255, 222, 255, 255, 255,},
-  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},
-  {190, 214, 251, 200, 210, 255, 224, 255, 255, 255,},
-  {192, 215, 251, 201, 210, 255, 225, 255, 255, 255,},
-  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},
-  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},
-  {196, 218, 252, 204, 212, 255, 228, 255, 255, 255,},
-  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},
-  {198, 220, 252, 206, 214, 255, 229, 255, 255, 255,},
-  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},
-  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},
-  {202, 223, 253, 209, 216, 255, 231, 255, 255, 255,},
-  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},
-  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {207, 226, 253, 212, 218, 255, 234, 255, 255, 255,},
-  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},
-  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},
-  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},
-  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},
-  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},
-  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},
-  {217, 233, 254, 219, 224, 255, 239, 255, 255, 255,},
-  {218, 234, 255, 221, 225, 255, 240, 255, 255, 255,},
-  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},
-  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},
-  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},
-  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},
-  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},
-  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},
-  {231, 242, 255, 231, 233, 255, 246, 255, 255, 255,},
-  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},
-  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},
-  {235, 245, 255, 235, 237, 255, 248, 255, 255, 255,},
-  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},
-  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},
-  {241, 248, 255, 239, 241, 255, 250, 255, 255, 255,},
-  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},
-  {244, 250, 255, 243, 243, 255, 252, 255, 255, 255,},
-  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},
-  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},
-  {250, 253, 255, 248, 248, 255, 254, 255, 255, 255,},
-  {252, 254, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-};
-
-void vp9_get_model_distribution(vp9_prob p, vp9_prob *tree_probs,
-                                int b, int r) {
-  const vp9_prob (*model)[ENTROPY_NODES - 1];
-#if UNCONSTRAINED_NODES == 2
-  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75;
-  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75;
-  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)
-    model = vp9_modelcoefprobs_gg75;
-  else
-    model = vp9_modelcoefprobs_gg75;
-#else
-  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75p1;
-  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75p1;
-  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)
-    model = vp9_modelcoefprobs_gg75p1;
-  else
-    model = vp9_modelcoefprobs_gg75p1;
-#endif
-  vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
-             model[p] + UNCONSTRAINED_NODES - 1,
-             (ENTROPY_NODES - UNCONSTRAINED_NODES) * sizeof(vp9_prob));
+void vp9_model_to_full_probs_sb(
+    vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
+    vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  int c, p;
+  for (c = 0; c < COEF_BANDS; ++c)
+    for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
+      vp9_model_to_full_probs(model[c][p], full[c][p]);
+    }
 }
-#endif
 
 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
@@ -2077,7 +439,7 @@
   init_bit_tree(cat6, 14);
 }
 
-vp9_extra_bit_struct vp9_extra_bits[12] = {
+vp9_extra_bit vp9_extra_bits[12] = {
   { 0, 0, 0, 0},
   { 0, 0, 0, 1},
   { 0, 0, 0, 2},
@@ -2111,177 +473,32 @@
     int ctx;
     assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
     if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
-      ctx = (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
-             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+      ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
+             token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
     } else {
-      ctx = token_cache[neighbors[MAX_NEIGHBORS * c + 0]];
+      ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
     }
-    return vp9_pt_energy_class[ctx];
+    return ctx;
   }
 };
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
-#if CONFIG_MODELCOEFPROB
-  int b, r, c, p;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_DEFAULT_COUNTS
-  int h, g;
-  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {
-    for (g = 0; g < REF_TYPES; ++g) {
-      int i;
-      unsigned int branch_ct4x4[NZC4X4_NODES][2];
-      unsigned int branch_ct8x8[NZC8X8_NODES][2];
-      unsigned int branch_ct16x16[NZC16X16_NODES][2];
-      unsigned int branch_ct32x32[NZC32X32_NODES][2];
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc4x4_tree,
-          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,
-          default_nzc_counts_4x4[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc8x8_tree,
-          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,
-          default_nzc_counts_8x8[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc16x16_tree,
-          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,
-          default_nzc_counts_16x16[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc32x32_tree,
-          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,
-          default_nzc_counts_32x32[h][g][i], 0);
-      }
-    }
-  }
-#else
-  vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4,
-             sizeof(pc->fc.nzc_probs_4x4));
-  vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8,
-             sizeof(pc->fc.nzc_probs_8x8));
-  vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16,
-             sizeof(pc->fc.nzc_probs_16x16));
-  vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32,
-             sizeof(pc->fc.nzc_probs_32x32));
-#endif
-  vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs,
-             sizeof(pc->fc.nzc_pcat_probs));
-#endif  // CONFIG_CODE_NONZEROCOUNT
-#if CONFIG_MODELCOEFPROB
-  for (b = 0; b < BLOCK_TYPES; ++b)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (c = 0; c < COEF_BANDS; ++c)
-        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
-          int t;
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_4x4[b][r][c][p][t] =
-                default_coef_probs_4x4[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_4x4[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_4x4[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_8x8[b][r][c][p][t] =
-                default_coef_probs_8x8[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_8x8[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_8x8[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_16x16[b][r][c][p][t] =
-                default_coef_probs_16x16[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_16x16[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_16x16[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_32x32[b][r][c][p][t] =
-                default_coef_probs_32x32[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_32x32[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_32x32[b][r][c][p], b, r);
-        }
-#else
-  vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
-             sizeof(pc->fc.coef_probs_4x4));
-  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
-             sizeof(pc->fc.coef_probs_8x8));
-  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
-             sizeof(pc->fc.coef_probs_16x16));
-  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
-             sizeof(pc->fc.coef_probs_32x32));
-#endif
+  vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
+             sizeof(pc->fc.coef_probs[TX_4X4]));
+  vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8,
+             sizeof(pc->fc.coef_probs[TX_8X8]));
+  vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16,
+             sizeof(pc->fc.coef_probs[TX_16X16]));
+  vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32,
+             sizeof(pc->fc.coef_probs[TX_32X32]));
 }
 
-#if CONFIG_MODELCOEFPROB
-// This is a placeholder function that will enable the default coef probs to
-// change for key frames based on the base_qindex. If base_qindex is large,
-// we can expect probabilities of zeros to be bigger, and vice versa. The rest
-// of the probabilities are derived from the nodel.
-void vp9_adjust_default_coef_probs(VP9_COMMON *cm) {
-  static const int factor_bits = 4;
-  static const int factor_rnd = 8;   // (1 << (factor_bits - 1))
-  int b, r, c, p;
-  int factor = (1 << factor_bits);
-  /*
-  if (cm->base_qindex < 32)
-    factor -= ((32 - cm->base_qindex) >> 4);
-    */
-  if (cm->base_qindex > 128)
-    factor += ((cm->base_qindex - 128) >> 4);
-  // printf(" Q %d factor %d\n", cm->base_qindex, factor);
-
-  for (b = 0; b < BLOCK_TYPES; ++b)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (c = 0; c < COEF_BANDS; ++c)
-        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
-          int t, x;
-          vp9_prob prob;
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_4x4[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_4x4[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_4x4[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_8x8[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_8x8[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_8x8[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_16x16[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_16x16[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_16x16[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_32x32[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_32x32[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_32x32[b][r][c][p], b, r);
-        }
-}
-#endif
-
 // Neighborhood 5-tuples for various scans and blocksizes,
 // in {top, left, topleft, topright, bottomleft} order
 // for each position in raster scan order.
 // -1 indicates the neighbor does not exist.
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_4x4_neighbors[16 * MAX_NEIGHBORS]);
+                vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
@@ -2291,15 +508,15 @@
 DECLARE_ALIGNED(16, int,
                 vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);
+                vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);
+                vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);
+                vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
 
 static int find_in_scan(const int *scan, int l, int idx) {
   int n, l2 = l * l;
@@ -2361,32 +578,32 @@
 }
 
 void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,
-                      vp9_default_zig_zag1d_4x4_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_4x4, 4,
+                      vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_4x4, 4,
                       vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_4x4, 4,
                       vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,
-                      vp9_default_zig_zag1d_8x8_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_8x8, 8,
+                      vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_8x8, 8,
                       vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_8x8, 8,
                       vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,
-                      vp9_default_zig_zag1d_16x16_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_16x16, 16,
+                      vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_16x16, 16,
                       vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_16x16, 16,
                       vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,
-                      vp9_default_zig_zag1d_32x32_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_32x32, 32,
+                      vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
 }
 
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {
-  if (scan == vp9_default_zig_zag1d_4x4) {
+  if (scan == vp9_default_scan_4x4) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_4x4_neighbors;
+    return vp9_default_scan_4x4_neighbors;
   } else if (scan == vp9_row_scan_4x4) {
     *pad = MAX_NEIGHBORS;
     return vp9_row_scan_4x4_neighbors;
@@ -2393,9 +610,9 @@
   } else if (scan == vp9_col_scan_4x4) {
     *pad = MAX_NEIGHBORS;
     return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_8x8) {
+  } else if (scan == vp9_default_scan_8x8) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_8x8_neighbors;
+    return vp9_default_scan_8x8_neighbors;
   } else if (scan == vp9_row_scan_8x8) {
     *pad = 2;
     return vp9_row_scan_8x8_neighbors;
@@ -2402,9 +619,9 @@
   } else if (scan == vp9_col_scan_8x8) {
     *pad = 2;
     return vp9_col_scan_8x8_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_16x16) {
+  } else if (scan == vp9_default_scan_16x16) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_16x16_neighbors;
+    return vp9_default_scan_16x16_neighbors;
   } else if (scan == vp9_row_scan_16x16) {
     *pad = 2;
     return vp9_row_scan_16x16_neighbors;
@@ -2411,9 +628,9 @@
   } else if (scan == vp9_col_scan_16x16) {
     *pad = 2;
     return vp9_col_scan_16x16_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_32x32) {
+  } else if (scan == vp9_default_scan_32x32) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_32x32_neighbors;
+    return vp9_default_scan_32x32_neighbors;
   } else {
     assert(0);
     return NULL;
@@ -2424,1098 +641,8 @@
   vp9_init_neighbors();
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);
-  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);
-  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);
-  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);
-#endif
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-
-#define mb_in_cur_tile(cm, mb_row, mb_col)      \
-    ((mb_col) >= (cm)->cur_tile_mb_col_start && \
-     (mb_col) <= (cm)->cur_tile_mb_col_end   && \
-     (mb_row) >= 0)
-
-#define choose_nzc_context(nzc_exp, t2, t1)     \
-    ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0)
-
-#define NZC_T2_32X32    (16 << 6)
-#define NZC_T1_32X32     (4 << 6)
-
-#define NZC_T2_16X16    (12 << 6)
-#define NZC_T1_16X16     (3 << 6)
-
-#define NZC_T2_8X8       (8 << 6)
-#define NZC_T1_8X8       (2 << 6)
-
-#define NZC_T2_4X4       (4 << 6)
-#define NZC_T1_4X4       (1 << 6)
-
-// Transforms a mb16 block index to a sb64 block index
-static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 3);
-  int c = (mb_col & 3);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 16 + jb;
-    assert(b < 256);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 16 + ib * 8 + jb;
-    assert(b >= 256 && b < 384);
-    return b;
-  }
-}
-
-// Transforms a mb16 block index to a sb32 block index
-static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 1);
-  int c = (mb_col & 1);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 8 + jb;
-    assert(b < 64);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 4 + ib * 4 + jb;
-    assert(b >= 64 && b < 96);
-    return b;
-  }
-}
-
-static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {
-  // s is the log of the number of 4x4 blocks in each row/col of larger block
-  int b, ib, jb, nb;
-  ib = block >> s;
-  jb = block - (ib << s);
-  ib >>= tx_size;
-  jb >>= tx_size;
-  nb = 1 << (s - tx_size);
-  b = (ib * nb + jb) << (2 * tx_size);
-  return b;
-}
-
-/* BEGIN - Helper functions to get the y nzcs */
-static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 256);
-  b = block_to_txfm_index(block, mi->txfm_size, 4);
-  assert(b < 256);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 64);
-  b = block_to_txfm_index(block, mi->txfm_size, 3);
-  assert(b < 64);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 16);
-  b = block_to_txfm_index(block, mi->txfm_size, 2);
-  assert(b < 16);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-/* END - Helper functions to get the y nzcs */
-
-/* Function to get y nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,
-                                  int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb32(
-          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-      return 0;
-    return get_nzc_4x4_y_mb16(mi, block);
-  }
-}
-
-/* BEGIN - Helper functions to get the uv nzcs */
-static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 256 && block < 384);
-  uvtxfm_size = mi->txfm_size;
-  base = 256 + (block & 64);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 3);
-  assert(b >= 256 && b < 384);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 64 && block < 96);
-  if (mi->txfm_size == TX_32X32)
-    uvtxfm_size = TX_16X16;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 64 + (block & 16);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 2);
-  assert(b >= 64 && b < 96);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 16 && block < 24);
-  if (mi->txfm_size == TX_8X8 &&
-      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))
-    uvtxfm_size = TX_4X4;
-  else if (mi->txfm_size == TX_16X16)
-    uvtxfm_size = TX_8X8;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 16 + (block & 4);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 1);
-  assert(b >= 16 && b < 24);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-/* END - Helper functions to get the uv nzcs */
-
-/* Function to get uv nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,
-                                   int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_uv_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-    return get_nzc_4x4_uv_sb32(
-        &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    return get_nzc_4x4_uv_mb16(mi, block);
-  }
-}
-
-int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 256);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert((block & 63) == 0);
-      if (block < 128) {
-        int o = (block >> 6) * 2;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;
-      }
-      if ((block & 127) == 0) {
-        int o = (block >> 7) * 2;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;
-      }
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 64) {
-        int o = block >> 4;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;
-      }
-      if ((block & 63) == 0) {
-        int o = block >> 6;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 32) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 6;
-        int p = ((block >> 5) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 16) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 6;
-        int p = (block >> 4) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 64);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert(block == 0);
-      nzc_exp =
-          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 32) {
-        int o = (block >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 5;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 16) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 5;
-        int p = ((block >> 4) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 8) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((block & 7) == 0) {
-        int o = block >> 5;
-        int p = (block >> 3) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 16);
-  switch (txfm_size) {
-    case TX_16X16:
-      assert(block == 0);
-      nzc_exp =
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 8) {
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((block & 7) == 0) {
-        int p = ((block >> 3) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (block < 4) {
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((block & 3) == 0) {
-        int p = (block >> 2) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 63);
-  const int boff = (block & 63);
-  const int base_mb16 = base >> 4;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 256 && block < 384);
-  txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_32X32:
-      assert(block == 256 || block == 320);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert((block & 15) == 0);
-      if (boff < 32) {
-        int o = (boff >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((boff & 31) == 0) {
-        int o = boff >> 5;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 16) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((boff & 15) == 0) {
-        int o = boff >> 4;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 8) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 4;
-        int p = (boff >> 3) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 15);
-  const int boff = (block & 15);
-  const int base_mb16 = base >> 2;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 64 && block < 96);
-  if (txfm_size == TX_32X32)
-    txfm_size_uv = TX_16X16;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert(block == 64 || block == 80);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 8) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 3;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 4) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((boff & 3) == 0) {
-        int o = boff >> 3;
-        int p = (boff >> 2) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 3);
-  const int boff = (block & 3);
-  const int base_mb16 = base;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 16 && block < 24);
-  if (txfm_size == TX_16X16)
-    txfm_size_uv = TX_8X8;
-  else if (txfm_size == TX_8X8 &&
-           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))
-    txfm_size_uv = TX_4X4;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_8X8:
-      assert((block & 3) == 0);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 2) {
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);
-      }
-      if ((boff & 1) == 0) {
-        int p = (boff >> 1) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    assert(block < 384);
-    if (block < 256)
-      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    assert(block < 96);
-    if (block < 64)
-      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else {
-    assert(block < 64);
-    if (block < 16)
-      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  }
-}
-
-static void update_nzc(VP9_COMMON *cm,
-                       uint16_t nzc,
-                       int nzc_context,
-                       TX_SIZE tx_size,
-                       int ref,
-                       int type) {
-  int e, c;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32)
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_16X16)
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_8X8)
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_4X4)
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  else
-    assert(0);
-
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
-}
-
-static void update_nzcs_sb64(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_sb32(VP9_COMMON *cm,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_mb16(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-void vp9_update_nzc_counts(VP9_COMMON *cm,
-                           MACROBLOCKD *xd,
-                           int mb_row,
-                           int mb_col) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    update_nzcs_sb64(cm, xd, mb_row, mb_col);
-  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    update_nzcs_sb32(cm, xd, mb_row, mb_col);
-  else
-    update_nzcs_mb16(cm, xd, mb_row, mb_col);
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 // #define COEF_COUNT_TESTING
 
 #define COEF_COUNT_SAT 24
@@ -3525,34 +652,61 @@
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs,
-                             vp9_coeff_probs *pre_coef_probs,
-                             int block_types, vp9_coeff_count *coef_counts,
-                             unsigned int (*eob_branch_count)[REF_TYPES]
-                                                             [COEF_BANDS]
-                                                      [PREV_COEF_CONTEXTS],
+void vp9_full_to_model_count(unsigned int *model_count,
+                             unsigned int *full_count) {
+  int n;
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
+    model_count[TWO_TOKEN] += full_count[n];
+  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
+}
+
+void vp9_full_to_model_counts(
+    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+  int i, j, k, l;
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < REF_TYPES; ++j)
+      for (k = 0; k < COEF_BANDS; ++k)
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)
+            continue;
+          vp9_full_to_model_count(model_count[i][j][k][l],
+                                  full_count[i][j][k][l]);
+        }
+}
+
+static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
                              int count_sat, int update_factor) {
+  vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
+  vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size];
+  vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size];
+  unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+      cm->fc.eob_branch_counts[txfm_size];
   int t, i, j, k, l, count;
-  unsigned int branch_ct[ENTROPY_NODES][2];
-  vp9_prob coef_probs[ENTROPY_NODES];
   int factor;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT
-  int entropy_nodes_adapt = UNCONSTRAINED_ADAPT_NODES;
-#else
-  int entropy_nodes_adapt = ENTROPY_NODES;
-#endif
+  unsigned int branch_ct[UNCONSTRAINED_NODES][2];
+  vp9_prob coef_probs[UNCONSTRAINED_NODES];
+  int entropy_nodes_adapt = UNCONSTRAINED_NODES;
 
-  for (i = 0; i < block_types; ++i)
+  for (i = 0; i < BLOCK_TYPES; ++i)
     for (j = 0; j < REF_TYPES; ++j)
       for (k = 0; k < COEF_BANDS; ++k)
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           if (l >= 3 && k == 0)
             continue;
-          vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           coef_probs, branch_ct,
-                                           coef_counts[i][j][k][l], 0);
+          vp9_tree_probs_from_distribution(
+              vp9_coefmodel_tree,
+              coef_probs, branch_ct,
+              coef_counts[i][j][k][l], 0);
+#if CONFIG_BALANCED_COEFTREE
+          branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0];
+          coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]);
+#else
           branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
           coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
+#endif
           for (t = 0; t < entropy_nodes_adapt; ++t) {
             count = branch_ct[t][0] + branch_ct[t][1];
             count = count > count_sat ? count_sat : count;
@@ -3560,21 +714,16 @@
             dst_coef_probs[i][j][k][l][t] =
                 weighted_prob(pre_coef_probs[i][j][k][l][t],
                               coef_probs[t], factor);
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT
-            if (t == UNCONSTRAINED_NODES - 1)
-              vp9_get_model_distribution(
-                  dst_coef_probs[i][j][k][l][UNCONSTRAINED_NODES - 1],
-                  dst_coef_probs[i][j][k][l], i, j);
-#endif
           }
         }
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+  TX_SIZE t;
   int count_sat;
   int update_factor; /* denominator 256 */
 
-  if (cm->frame_type == KEY_FRAME) {
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
     count_sat = COEF_COUNT_SAT_KEY;
   } else if (cm->last_frame_type == KEY_FRAME) {
@@ -3584,142 +733,6 @@
     update_factor = COEF_MAX_UPDATE_FACTOR;
     count_sat = COEF_COUNT_SAT;
   }
-
-  adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
-                   BLOCK_TYPES, cm->fc.coef_counts_4x4,
-                   cm->fc.eob_branch_counts[TX_4X4],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
-                   BLOCK_TYPES, cm->fc.coef_counts_8x8,
-                   cm->fc.eob_branch_counts[TX_8X8],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
-                   BLOCK_TYPES, cm->fc.coef_counts_16x16,
-                   cm->fc.eob_branch_counts[TX_16X16],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
-                   BLOCK_TYPES, cm->fc.coef_counts_32x32,
-                   cm->fc.eob_branch_counts[TX_32X32],
-                   count_sat, update_factor);
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    adapt_coef_probs(cm, t, count_sat, update_factor);
 }
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void adapt_nzc_probs(VP9_COMMON *cm,
-                            int block_size,
-                            int count_sat,
-                            int update_factor) {
-  int c, r, b, n;
-  int count, factor;
-  unsigned int nzc_branch_ct[NZC32X32_NODES][2];
-  vp9_prob nzc_probs[NZC32X32_NODES];
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *dst_nzc_probs;
-  vp9_prob *pre_nzc_probs;
-  unsigned int *nzc_counts;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
-  } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
-  }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         nzc_probs, nzc_branch_ct,
-                                         nzc_counts + offset_tokens, 0);
-        for (n = 0; n < nodes; ++n) {
-          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          dst_nzc_probs[offset_nodes + n] =
-              weighted_prob(pre_nzc_probs[offset_nodes + n],
-                            nzc_probs[n], factor);
-        }
-      }
-}
-
-static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) {
-  int c, t;
-  int count, factor;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      int b;
-      for (b = 0; b < bits; ++b) {
-        vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        count = cm->fc.nzc_pcat_counts[c][t][b][0] +
-                cm->fc.nzc_pcat_counts[c][t][b][1];
-        count = count > count_sat ? count_sat : count;
-        factor = (update_factor * count / count_sat);
-        cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob(
-            cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor);
-      }
-    }
-  }
-}
-
-// #define NZC_COUNT_TESTING
-void vp9_adapt_nzc_probs(VP9_COMMON *cm) {
-  int count_sat;
-  int update_factor; /* denominator 256 */
-#ifdef NZC_COUNT_TESTING
-  int c, r, b, t;
-  printf("\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("    {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);
-        }
-        printf("}\n");
-      }
-      printf("\n");
-    }
-#endif
-
-  if (cm->frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
-  }
-
-  adapt_nzc_probs(cm, 4, count_sat, update_factor);
-  adapt_nzc_probs(cm, 8, count_sat, update_factor);
-  adapt_nzc_probs(cm, 16, count_sat, update_factor);
-  adapt_nzc_probs(cm, 32, count_sat, update_factor);
-  adapt_nzc_pcat(cm, count_sat, update_factor);
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -16,8 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 
-extern const int vp9_i8x8_block[4];
-
 /* Coefficient token alphabet */
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
@@ -40,16 +38,19 @@
 
 extern const vp9_tree_index vp9_coef_tree[];
 
-extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+#define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
+extern const vp9_tree_index vp9_coefmodel_tree[];
 
+extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
 typedef struct {
   vp9_tree_p tree;
   const vp9_prob *prob;
-  int Len;
+  int len;
   int base_val;
-} vp9_extra_bit_struct;
+} vp9_extra_bit;
 
-extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
+extern vp9_extra_bit vp9_extra_bits[12];    /* indexed by token value */
 
 #define PROB_UPDATE_BASELINE_COST   7
 
@@ -84,6 +85,8 @@
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #define PREV_COEF_CONTEXTS          6
 
+// #define ENTROPY_STATS
+
 typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                     [MAX_ENTROPY_TOKENS];
 typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
@@ -96,173 +99,126 @@
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
 
-extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);
 
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);
 
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]);
 
 void vp9_coef_tree_initialize(void);
 void vp9_adapt_coef_probs(struct VP9Common *);
 
-static INLINE void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
+static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd,
+                                               BLOCK_SIZE_TYPE bsize) {
   /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  const int bw = 1 << b_width_log2(bsize);
+  const int bh = 1 << b_height_log2(bsize);
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    vpx_memset(xd->plane[i].above_context, 0,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x);
+    vpx_memset(xd->plane[i].left_context, 0,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y);
+  }
 }
 
-static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd) {
-  /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-}
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// For 4x4 blocks the index is less but to keep things common the lookup
+// table for 4x4 is padded out to this index.
+#define MAXBAND_INDEX 21
 
-static INLINE void vp9_reset_sb64_tokens_context(MACROBLOCKD* const xd) {
-  /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-}
+extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
+extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
 
-extern const int vp9_coef_bands8x8[64];
-extern const int vp9_coef_bands4x4[16];
 
-static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {
-  if (tx_size == TX_4X4) {
-    return vp9_coef_bands4x4[scan[coef_index]];
-  } else {
-    const int pos = scan[coef_index];
-    const int sz = 1 << (2 + tx_size);
-    const int x = pos & (sz - 1), y = pos >> (2 + tx_size);
-    if (x >= 8 || y >= 8)
-      return 5;
-    else
-      return vp9_coef_bands8x8[y * 8 + x];
-  }
+static int get_coef_band(const uint8_t * band_translate, int coef_index) {
+  return (coef_index > MAXBAND_INDEX)
+    ? (COEF_BANDS-1) : band_translate[coef_index];
 }
+
 extern int vp9_get_coef_context(const int *scan, const int *neighbors,
                                 int nb_pad, uint8_t *token_cache, int c, int l);
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
 
-#if CONFIG_MODELCOEFPROB
-#define COEFPROB_BITS               8
-#define COEFPROB_MODELS             (1 << COEFPROB_BITS)
 
-// 2 => EOB and Zero nodes are unconstrained, rest are modeled
-// 3 => EOB, Zero and One nodes are unconstrained, rest are modeled
-#define UNCONSTRAINED_NODES         3   // Choose one of 2 or 3
+// 128 lists of probabilities are stored for the following ONE node probs:
+// 1, 3, 5, 7, ..., 253, 255
+// In between probabilities are interpolated linearly
 
-// whether forward updates are model-based
-#define MODEL_BASED_UPDATE          0
-// if model-based how many nodes are unconstrained
-#define UNCONSTRAINED_UPDATE_NODES  3
-// whether backward updates are model-based
-#define MODEL_BASED_ADAPT           0
-#define UNCONSTRAINED_ADAPT_NODES   3
+#define COEFPROB_MODELS             128
 
-// whether to adjust the coef probs for key frames based on qindex
-#define ADJUST_KF_COEF_PROBS        0
+#define UNCONSTRAINED_NODES         3
+#define MODEL_NODES                 (ENTROPY_NODES - UNCONSTRAINED_NODES)
 
+#define PIVOT_NODE                  2   // which node is pivot
+
 typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
-                                      [PREV_COEF_CONTEXTS][2];
-extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
-void vp9_get_model_distribution(vp9_prob model, vp9_prob *tree_probs,
-                                int b, int r);
-void vp9_adjust_default_coef_probs(struct VP9Common *cm);
-#endif  // CONFIG_MODELCOEFPROB
+                                      [PREV_COEF_CONTEXTS]
+                                      [UNCONSTRAINED_NODES];
 
-#if CONFIG_CODE_NONZEROCOUNT
-/* Alphabet for number of non-zero symbols in block */
-#define NZC_0                   0       /* Used for all blocks */
-#define NZC_1                   1       /* Used for all blocks */
-#define NZC_2                   2       /* Used for all blocks */
-#define NZC_3TO4                3       /* Used for all blocks */
-#define NZC_5TO8                4       /* Used for all blocks */
-#define NZC_9TO16               5       /* Used for all blocks */
-#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */
-#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */
-#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */
-#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */
-#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */
-#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */
+typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES + 1];
+typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES][2];
+extern void vp9_full_to_model_count(unsigned int *model_count,
+                                    unsigned int *full_count);
+extern void vp9_full_to_model_counts(
+    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
 
-/* Number of tokens for each block size */
-#define NZC4X4_TOKENS           6
-#define NZC8X8_TOKENS           8
-#define NZC16X16_TOKENS        10
-#define NZC32X32_TOKENS        12
+void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
-/* Number of nodes for each block size */
-#define NZC4X4_NODES            5
-#define NZC8X8_NODES            7
-#define NZC16X16_NODES          9
-#define NZC32X32_NODES         11
+void vp9_model_to_full_probs_sb(
+    vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
+    vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]);
 
-/* Max number of tokens with extra bits */
-#define NZC_TOKENS_EXTRA        9
+extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
 
-/* Max number of extra bits */
-#define NZC_BITS_EXTRA          9
+static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_4x4;
+    case DCT_ADST:
+      return vp9_col_scan_4x4;
+    default:
+      return vp9_default_scan_4x4;
+  }
+}
 
-/* Tokens without extra bits */
-#define NZC_TOKENS_NOEXTRA      (NZC32X32_TOKENS - NZC_TOKENS_EXTRA)
+static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_8x8;
+    case DCT_ADST:
+      return vp9_col_scan_8x8;
+    default:
+      return vp9_default_scan_8x8;
+  }
+}
 
-#define MAX_NZC_CONTEXTS        3
-
-/* whether to update extra bit probabilities */
-#define NZC_PCAT_UPDATE
-
-/* nzc trees */
-extern const vp9_tree_index    vp9_nzc4x4_tree[];
-extern const vp9_tree_index    vp9_nzc8x8_tree[];
-extern const vp9_tree_index    vp9_nzc16x16_tree[];
-extern const vp9_tree_index    vp9_nzc32x32_tree[];
-
-/* nzc encodings */
-extern struct vp9_token_struct  vp9_nzc4x4_encodings[NZC4X4_TOKENS];
-extern struct vp9_token_struct  vp9_nzc8x8_encodings[NZC8X8_TOKENS];
-extern struct vp9_token_struct  vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-extern struct vp9_token_struct  vp9_nzc32x32_encodings[NZC32X32_TOKENS];
-
-#define codenzc(x) (\
-  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \
-  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\
-  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)
-
-int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);
-void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,
-                           int mb_row, int mb_col);
-void vp9_adapt_nzc_probs(struct VP9Common *cm);
-
-/* Extra bits array */
-extern const int vp9_extranzcbits[NZC32X32_TOKENS];
-
-/* Base nzc values */
-extern const int vp9_basenzcvalue[NZC32X32_TOKENS];
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
+static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_16x16;
+    case DCT_ADST:
+      return vp9_col_scan_16x16;
+    default:
+      return vp9_default_scan_16x16;
+  }
+}
 
 #include "vp9/common/vp9_coefupdateprobs.h"
 
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -15,464 +15,274 @@
 #include "vp9/common/vp9_alloccommon.h"
 #include "vpx_mem/vpx_mem.h"
 
-static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
+static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
+  { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
+  { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
+  { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
+  { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
+  { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
+  { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
+  { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+  { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
+  { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
 };
 
-static const unsigned int y_mode_cts  [VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
+                                        [VP9_INTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* block_size < 8x8 */,
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* block_size < 16x16 */,
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* block_size < 32x32 */,
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* block_size >= 32x32 */
 };
 
-static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  /* DC   V   H  D45 135 117 153 D27 D63 TM */
-  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
-  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
-  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
-  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
-  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
-  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
-  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
-  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
-  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
-  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
+static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
+  {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
+  {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
+  {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
+  {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
+  {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
+  {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
+  {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+  {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
+  { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
 };
 
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V   H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
+const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
+                                  [PARTITION_TYPES - 1] = {
+  { /* frame_type = keyframe */
+    /* 8x8 -> 4x4 */
+    { 158,  97,  94 } /* a/l both not split */,
+    {  93,  24,  99 } /* a split, l not split */,
+    {  85, 119,  44 } /* l split, a not split */,
+    {  62,  59,  67 } /* a/l both split */,
+    /* 16x16 -> 8x8 */
+    { 149,  53,  53 } /* a/l both not split */,
+    {  94,  20,  48 } /* a split, l not split */,
+    {  83,  53,  24 } /* l split, a not split */,
+    {  52,  18,  18 } /* a/l both split */,
+    /* 32x32 -> 16x16 */
+    { 150,  40,  39 } /* a/l both not split */,
+    {  78,  12,  26 } /* a split, l not split */,
+    {  67,  33,  11 } /* l split, a not split */,
+    {  24,   7,   5 } /* a/l both split */,
+    /* 64x64 -> 32x32 */
+    { 174,  35,  49 } /* a/l both not split */,
+    {  68,  11,  27 } /* a split, l not split */,
+    {  57,  15,   9 } /* l split, a not split */,
+    {  12,   3,   3 } /* a/l both split */
+  }, { /* frame_type = interframe */
+    /* 8x8 -> 4x4 */
+    { 199, 122, 141 } /* a/l both not split */,
+    { 147,  63, 159 } /* a split, l not split */,
+    { 148, 133, 118 } /* l split, a not split */,
+    { 121, 104, 114 } /* a/l both split */,
+    /* 16x16 -> 8x8 */
+    { 174,  73,  87 } /* a/l both not split */,
+    {  92,  41,  83 } /* a split, l not split */,
+    {  82,  99,  50 } /* l split, a not split */,
+    {  53,  39,  39 } /* a/l both split */,
+    /* 32x32 -> 16x16 */
+    { 177,  58,  59 } /* a/l both not split */,
+    {  68,  26,  63 } /* a split, l not split */,
+    {  52,  79,  25 } /* l split, a not split */,
+    {  17,  14,  12 } /* a/l both split */,
+    /* 64x64 -> 32x32 */
+    { 222,  34,  30 } /* a/l both not split */,
+    {  72,  16,  44 } /* a split, l not split */,
+    {  58,  32,  12 } /* l split, a not split */,
+    {  10,   7,   6 } /* a/l both split */
+  }
 };
 
-static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  // DC   V   H  D45 135 117 153 D27 D63 TM
-  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
-  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
-  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
-  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
-  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
-  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
-  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
-  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
-  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
-  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
+  -DC_PRED, 2,                      /* 0 = DC_NODE */
+  -TM_PRED, 4,                      /* 1 = TM_NODE */
+  -V_PRED, 6,                       /* 2 = V_NODE */
+  8, 12,                            /* 3 = COM_NODE */
+  -H_PRED, 10,                      /* 4 = H_NODE */
+  -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
+  -D45_PRED, 14,                    /* 6 = D45_NODE */
+  -D63_PRED, 16,                    /* 7 = D63_NODE */
+  -D153_PRED, -D27_PRED             /* 8 = D153_NODE */
 };
 
-static const unsigned int bmode_cts[VP9_NKF_BINTRAMODES] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  /* DC    TM     VE     HE   CONTEXT */
-  43891, 17694, 10036, 3920, 20000
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  /* DC    TM     VE     HE   LD    RD   CONTEXT */
-  43891, 17694, 10036, 3920, 3363, 2546, 14000
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  /* DC    TM     VE     HE   LD    RD   VR    VL    HD    HU   CONTEXT */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723, 50000
-#endif
-#else
-  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
-#endif
+const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
 };
 
-typedef enum {
-  SUBMVREF_NORMAL,
-  SUBMVREF_LEFT_ZED,
-  SUBMVREF_ABOVE_ZED,
-  SUBMVREF_LEFT_ABOVE_SAME,
-  SUBMVREF_LEFT_ABOVE_ZED
-} sumvfref_t;
+const vp9_tree_index vp9_partition_tree[6] = {
+  -PARTITION_NONE, 2,
+  -PARTITION_HORZ, 4,
+  -PARTITION_VERT, -PARTITION_SPLIT
+};
 
-int vp9_mv_cont(const int_mv *l, const int_mv *a) {
-  int lez = (l->as_int == 0);
-  int aez = (a->as_int == 0);
-  int lea = (l->as_int == a->as_int);
+struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
 
-  if (lea && lez)
-    return SUBMVREF_LEFT_ABOVE_ZED;
+struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
 
-  if (lea)
-    return SUBMVREF_LEFT_ABOVE_SAME;
+struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
-  if (aez)
-    return SUBMVREF_ABOVE_ZED;
-
-  if (lez)
-    return SUBMVREF_LEFT_ZED;
-
-  return SUBMVREF_NORMAL;
-}
-
-const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
-  { 147, 136, 18 },
-  { 106, 145, 1  },
-  { 179, 121, 1  },
-  { 223, 1, 34 },
-  { 208, 1, 1  }
+static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+  9, 102, 187, 225
 };
 
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
+static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+  239, 183, 119,  96,  41
 };
 
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-
-/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-
-const vp9_tree_index vp9_kf_bmode_tree[VP9_KF_BINTRAMODES * 2 - 2] = {
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED              /* 8 = HD_NODE */
+static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = {
+  50, 126, 123, 221, 226
 };
 
-const vp9_tree_index vp9_bmode_tree[VP9_NKF_BINTRAMODES * 2 - 2] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_VE_PRED, -B_HE_PRED
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, 8,
-  -B_VE_PRED, -B_HE_PRED,
-  10, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_RD_PRED, -B_LD_PRED,
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, 18,
-  -B_HU_PRED, -B_CONTEXT_PRED
-#endif
-#else
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED              /* 8 = HD_NODE */
-#endif
+static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
+  {  33,  16 },
+  {  77,  74 },
+  { 142, 142 },
+  { 172, 170 },
+  { 238, 247 }
 };
 
-/* Again, these trees use the same probability indices as their
-   explicitly-programmed predecessors. */
-const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
+const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
+                                          [TX_SIZE_MAX_SB - 1] = {
+  { 3, 136, 37, },
+  { 5, 52, 13, },
 };
-
-const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
+const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
+                                          [TX_SIZE_MAX_SB - 2] = {
+  { 20, 152, },
+  { 15, 101, },
 };
-
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
+const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
+                                        [TX_SIZE_MAX_SB - 3] = {
+  { 100, },
+  { 66, },
 };
 
-const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
+void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]) {
+  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
+  ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
+                    tx_count_32x32p[TX_16X16] +
+                    tx_count_32x32p[TX_32X32];
+  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
+  ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] +
+                    tx_count_32x32p[TX_32X32];
+  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
+  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
+}
 
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
+void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]) {
+  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
+  ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] +
+                    tx_count_16x16p[TX_16X16];
+  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
+  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
+}
 
-const vp9_tree_index vp9_mv_ref_tree[8] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, 6,
-  -NEWMV, -SPLITMV
-};
+void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]) {
+  ct_8x8p[0][0] =   tx_count_8x8p[TX_4X4];
+  ct_8x8p[0][1] =   tx_count_8x8p[TX_8X8];
+}
 
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, -NEWMV
+const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = {
+  192, 128, 64
 };
 
-const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
-  -LEFT4X4, 2,
-  -ABOVE4X4, 4,
-  -ZERO4X4, -NEW4X4
-};
-
-struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
-struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
-struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
-struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-
-struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
-
 void vp9_init_mbmode_probs(VP9_COMMON *x) {
-  unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */
+  vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs,
+             sizeof(default_if_uv_probs));
+  vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs,
+             sizeof(default_kf_uv_probs));
+  vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs,
+             sizeof(default_if_y_probs));
 
-  vp9_tree_probs_from_distribution(vp9_ymode_tree, x->fc.ymode_prob,
-                                   bct, y_mode_cts, 0);
-  vp9_tree_probs_from_distribution(vp9_sb_ymode_tree, x->fc.sb_ymode_prob,
-                                   bct, y_mode_cts, 0);
-  {
-    int i;
-    for (i = 0; i < 8; i++) {
-      vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree,
-                                       x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 0);
-    }
-  }
-  {
-    int i;
-    for (i = 0; i < VP9_YMODES; i++) {
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 0);
-    }
-  }
-
-  vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 0);
-
-  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
-             sizeof(vp9_sub_mv_ref_prob2));
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
   vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
              sizeof(vp9_switchable_interp_prob));
-#if CONFIG_COMP_INTERINTRA_PRED
-  x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB;
-#endif
-  x->ref_pred_probs[0] = 120;
-  x->ref_pred_probs[1] = 80;
-  x->ref_pred_probs[2] = 40;
-}
 
+  vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
+             sizeof(vp9_partition_probs));
 
-static void intra_bmode_probs_from_distribution(
-  vp9_prob p[VP9_NKF_BINTRAMODES - 1],
-  unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2],
-  const unsigned int events[VP9_NKF_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(vp9_bmode_tree, p, branch_ct, events, 0);
+  vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p,
+             sizeof(default_intra_inter_p));
+  vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p,
+             sizeof(default_comp_inter_p));
+  vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p,
+             sizeof(default_comp_ref_p));
+  vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p,
+             sizeof(default_single_ref_p));
+  vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p,
+             sizeof(vp9_default_tx_probs_32x32p));
+  vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p,
+             sizeof(vp9_default_tx_probs_16x16p));
+  vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p,
+             sizeof(vp9_default_tx_probs_8x8p));
+  vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs,
+             sizeof(vp9_default_mbskip_probs));
 }
 
-void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2];
-  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
-}
-
-static void intra_kf_bmode_probs_from_distribution(
-  vp9_prob p[VP9_KF_BINTRAMODES - 1],
-  unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2],
-  const unsigned int events[VP9_KF_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(vp9_kf_bmode_tree, p, branch_ct, events, 0);
-}
-
-void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES]
-                                          [VP9_KF_BINTRAMODES]
-                                          [VP9_KF_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2];
-  int i, j;
-
-  for (i = 0; i < VP9_KF_BINTRAMODES; ++i) {
-    for (j = 0; j < VP9_KF_BINTRAMODES; ++j) {
-      intra_kf_bmode_probs_from_distribution(
-          p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
-    }
-  }
-}
-
-#if VP9_SWITCHABLE_FILTERS == 3
 const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
   -0, 2,
   -1, -2
 };
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-#if CONFIG_ENABLE_6TAP
+struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  SIXTAP, EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {0, -1, 1, 2, -1, -1};
-#else
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
 const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
-#endif
 const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
                                           [VP9_SWITCHABLE_FILTERS-1] = {
-  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
 };
-#elif VP9_SWITCHABLE_FILTERS == 2
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248},
-  { 64},
-  {192},
-};
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, EIGHTTAP_SHARP};
-#if CONFIG_ENABLE_6TAP
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
-#else
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
-#endif
-#endif  // VP9_SWITCHABLE_FILTERS
 
 // Indicates if the filter is interpolating or non-interpolating
-// Note currently only the EIGHTTAP_SMOOTH is non-interpolating
-#if CONFIG_ENABLE_6TAP
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1};
-#else
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1};
-#endif
+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1};
 
 void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_kf_bmode_encodings,   vp9_kf_bmode_tree);
-  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
-  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
-  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-  vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
-  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
-  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
+  vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
 
-  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
-                              vp9_mv_ref_tree, NEARESTMV);
   vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
                               vp9_sb_mv_ref_tree, NEARESTMV);
-  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
-                              vp9_sub_mv_ref_tree, LEFT4X4);
 }
 
 void vp9_init_mode_contexts(VP9_COMMON *pc) {
-  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
-  vpx_memcpy(pc->fc.vp9_mode_contexts,
-             vp9_default_mode_contexts,
-             sizeof(vp9_default_mode_contexts));
+  vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts));
+  vpx_memcpy(pc->fc.inter_mode_probs,
+             vp9_default_inter_mode_probs,
+             sizeof(vp9_default_inter_mode_probs));
 }
 
 void vp9_accum_mv_refs(VP9_COMMON *pc,
                        MB_PREDICTION_MODE m,
                        const int context) {
-  unsigned int (*mv_ref_ct)[4][2];
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      pc->fc.inter_mode_counts;
 
-  mv_ref_ct = pc->fc.mv_ref_ct;
-
   if (m == ZEROMV) {
-    ++mv_ref_ct[context][0][0];
+    ++inter_mode_counts[context][0][0];
   } else {
-    ++mv_ref_ct[context][0][1];
+    ++inter_mode_counts[context][0][1];
     if (m == NEARESTMV) {
-      ++mv_ref_ct[context][1][0];
+      ++inter_mode_counts[context][1][0];
     } else {
-      ++mv_ref_ct[context][1][1];
+      ++inter_mode_counts[context][1][1];
       if (m == NEARMV) {
-        ++mv_ref_ct[context][2][0];
+        ++inter_mode_counts[context][2][0];
       } else {
-        ++mv_ref_ct[context][2][1];
-        if (m == NEWMV) {
-          ++mv_ref_ct[context][3][0];
-        } else {
-          ++mv_ref_ct[context][3][1];
-        }
+        ++inter_mode_counts[context][2][1];
       }
     }
   }
@@ -482,50 +292,35 @@
 #define MVREF_MAX_UPDATE_FACTOR 128
 void vp9_adapt_mode_context(VP9_COMMON *pc) {
   int i, j;
-  unsigned int (*mv_ref_ct)[4][2];
-  int (*mode_context)[4];
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      pc->fc.inter_mode_counts;
+  vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs;
 
-  mode_context = pc->fc.vp9_mode_contexts;
-
-  mv_ref_ct = pc->fc.mv_ref_ct;
-
   for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1], factor;
-
+    for (i = 0; i < VP9_INTER_MODES - 1; i++) {
+      int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1];
+      int factor;
       count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
       factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
-      mode_context[j][i] = weighted_prob(pc->fc.vp9_mode_contexts[j][i],
-                                         get_binary_prob(mv_ref_ct[j][i][0],
-                                                         mv_ref_ct[j][i][1]),
-                                         factor);
+      mode_context[j][i] = weighted_prob(
+          pc->fc.pre_inter_mode_probs[j][i],
+          get_binary_prob(inter_mode_counts[j][i][0],
+                          inter_mode_counts[j][i][1]),
+          factor);
     }
   }
 }
 
-#ifdef MODE_STATS
-#include "vp9/common/vp9_modecont.h"
-void print_mode_contexts(VP9_COMMON *pc) {
-  int j, i;
-  printf("\n====================\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context[j][i]);
-    }
-    printf("\n");
-  }
-  printf("====================\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context_a[j][i]);
-    }
-    printf("\n");
-  }
+#define MODE_COUNT_SAT 20
+#define MODE_MAX_UPDATE_FACTOR 128
+static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob,
+                          unsigned int branch_ct[2]) {
+  int factor, count = branch_ct[0] + branch_ct[1];
+  count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+  factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+  return weighted_prob(pre_prob, prob, factor);
 }
-#endif
 
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 144
 static void update_mode_probs(int n_modes,
                               const vp9_tree_index *tree, unsigned int *cnt,
                               vp9_prob *pre_probs, vp9_prob *dst_probs,
@@ -533,33 +328,37 @@
 #define MAX_PROBS 32
   vp9_prob probs[MAX_PROBS];
   unsigned int branch_ct[MAX_PROBS][2];
-  int t, count, factor;
+  int t;
 
   assert(n_modes - 1 < MAX_PROBS);
   vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
-  for (t = 0; t < n_modes - 1; ++t) {
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    dst_probs[t] = weighted_prob(pre_probs[t], probs[t], factor);
-  }
+  for (t = 0; t < n_modes - 1; ++t)
+    dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]);
 }
 
+static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) {
+  return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0],
+                                                  branch_ct[1]), branch_ct);
+}
+
 // #define MODE_COUNT_TESTING
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
-  int i;
+  int i, j;
+  FRAME_CONTEXT *fc = &cm->fc;
 #ifdef MODE_COUNT_TESTING
   int t;
 
   printf("static const unsigned int\nymode_counts"
-         "[VP9_YMODES] = {\n");
-  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+         "[VP9_INTRA_MODES] = {\n");
+  for (t = 0; t < VP9_INTRA_MODES; ++t)
+    printf("%d, ", fc->ymode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\nuv_mode_counts"
-         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
-  for (i = 0; i < VP9_YMODES; ++i) {
+         "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n");
+  for (i = 0; i < VP9_INTRA_MODES; ++i) {
     printf("  {");
-    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+    for (t = 0; t < VP9_INTRA_MODES; ++t)
+      printf("%d, ", fc->uv_mode_counts[i][t]);
     printf("},\n");
   }
   printf("};\n");
@@ -566,71 +365,108 @@
   printf("static const unsigned int\nbmode_counts"
          "[VP9_NKF_BINTRAMODES] = {\n");
   for (t = 0; t < VP9_NKF_BINTRAMODES; ++t)
-    printf("%d, ", cm->fc.bmode_counts[t]);
+    printf("%d, ", fc->bmode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\ni8x8_mode_counts"
          "[VP9_I8X8_MODES] = {\n");
-  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+  for (t = 0; t < VP9_I8X8_MODES; ++t)
+    printf("%d, ", fc->i8x8_mode_counts[t]);
   printf("};\n");
-  printf("static const unsigned int\nsub_mv_ref_counts"
-         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
   printf("static const unsigned int\nmbsplit_counts"
          "[VP9_NUMMBSPLITS] = {\n");
-  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+  for (t = 0; t < VP9_NUMMBSPLITS; ++t)
+    printf("%d, ", fc->mbsplit_counts[t]);
   printf("};\n");
-#if CONFIG_COMP_INTERINTRA_PRED
-  printf("static const unsigned int\ninterintra_counts"
-         "[2] = {\n");
-  for (t = 0; t < 2; ++t) printf("%d, ", cm->fc.interintra_counts[t]);
-  printf("};\n");
 #endif
-#endif
 
-  update_mode_probs(VP9_YMODES, vp9_ymode_tree,
-                    cm->fc.ymode_counts, cm->fc.pre_ymode_prob,
-                    cm->fc.ymode_prob, 0);
-  update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_tree,
-                    cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,
-                    cm->fc.sb_ymode_prob, 0);
-  for (i = 0; i < VP9_YMODES; ++i) {
-    update_mode_probs(VP9_UV_MODES, vp9_uv_mode_tree,
-                      cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],
-                      cm->fc.uv_mode_prob[i], 0);
-  }
-  update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
-                    cm->fc.bmode_counts, cm->fc.pre_bmode_prob,
-                    cm->fc.bmode_prob, 0);
-  update_mode_probs(VP9_I8X8_MODES,
-                    vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,
-                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob, 0);
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    update_mode_probs(VP9_SUBMVREFS,
-                      vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],
-                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i],
-                      LEFT4X4);
-  }
-  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
-                    cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,
-                    cm->fc.mbsplit_prob, 0);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->use_interintra) {
-    int factor, interintra_prob, count;
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i],
+                                              fc->intra_inter_count[i]);
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i],
+                                             fc->comp_inter_count[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i],
+                                           fc->comp_ref_count[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j],
+                                                  fc->single_ref_count[i][j]);
 
-    interintra_prob = get_binary_prob(cm->fc.interintra_counts[0],
-                                      cm->fc.interintra_counts[1]);
-    count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    cm->fc.interintra_prob = weighted_prob(cm->fc.pre_interintra_prob,
-                                           interintra_prob, factor);
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+                      fc->y_mode_counts[i], fc->pre_y_mode_prob[i],
+                      fc->y_mode_prob[i], 0);
+
+  for (i = 0; i < VP9_INTRA_MODES; ++i)
+    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+                      fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i],
+                      fc->uv_mode_prob[i], 0);
+
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
+    update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
+                      fc->partition_counts[i], fc->pre_partition_prob[i],
+                      fc->partition_prob[INTER_FRAME][i], 0);
+
+  if (cm->mcomp_filter_type == SWITCHABLE) {
+    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+      update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
+                        fc->switchable_interp_count[i],
+                        fc->pre_switchable_interp_prob[i],
+                        fc->switchable_interp_prob[i], 0);
+    }
   }
-#endif
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    int j;
+    unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
+    unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
+    unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
+                                     branch_ct_8x8p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
+        int factor;
+        int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0],
+                                        branch_ct_8x8p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_8x8p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_8x8p[i][j], prob, factor);
+      }
+    }
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+                                       branch_ct_16x16p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
+        int factor;
+        int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0],
+                                        branch_ct_16x16p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_16x16p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_16x16p[i][j], prob, factor);
+      }
+    }
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
+                                       branch_ct_32x32p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+        int factor;
+        int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0],
+                                        branch_ct_32x32p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_32x32p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_32x32p[i][j], prob, factor);
+      }
+    }
+  }
+  for (i = 0; i < MBSKIP_CONTEXTS; ++i)
+    fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i],
+                                          fc->mbskip_count[i]);
 }
 
 static void set_default_lf_deltas(MACROBLOCKD *xd) {
@@ -637,15 +473,13 @@
   xd->mode_ref_lf_delta_enabled = 1;
   xd->mode_ref_lf_delta_update = 1;
 
-  xd->ref_lf_deltas[INTRA_FRAME] = 2;
+  xd->ref_lf_deltas[INTRA_FRAME] = 1;
   xd->ref_lf_deltas[LAST_FRAME] = 0;
-  xd->ref_lf_deltas[GOLDEN_FRAME] = -2;
-  xd->ref_lf_deltas[ALTREF_FRAME] = -2;
+  xd->ref_lf_deltas[GOLDEN_FRAME] = -1;
+  xd->ref_lf_deltas[ALTREF_FRAME] = -1;
 
-  xd->mode_lf_deltas[0] = 4;               // BPRED
-  xd->mode_lf_deltas[1] = -2;              // Zero
-  xd->mode_lf_deltas[2] = 2;               // New mv
-  xd->mode_lf_deltas[3] = 4;               // Split mv
+  xd->mode_lf_deltas[0] = 0;              // Zero
+  xd->mode_lf_deltas[1] = 0;               // New mv
 }
 
 void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
@@ -655,9 +489,9 @@
   vp9_clearall_segfeatures(xd);
   xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
   if (cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
-  /* reset the mode ref deltas for loop filter */
+  // Reset the mode ref deltas for loop filter
   vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
   vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
   set_default_lf_deltas(xd);
@@ -664,33 +498,38 @@
 
   vp9_default_coef_probs(cm);
   vp9_init_mbmode_probs(cm);
-  vp9_default_bmode_probs(cm->fc.bmode_prob);
-  vp9_kf_default_bmode_probs(cm->kf_bmode_prob);
+  vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs,
+             sizeof(vp9_kf_default_bmode_probs));
   vp9_init_mv_probs(cm);
+
   // To force update of the sharpness
   cm->last_sharpness_level = -1;
 
   vp9_init_mode_contexts(cm);
 
-  for (i = 0; i < NUM_FRAME_CONTEXTS; i++) {
-    vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+  if ((cm->frame_type == KEY_FRAME) ||
+      cm->error_resilient_mode || (cm->reset_frame_context == 3)) {
+    // Reset all frame contexts.
+    for (i = 0; i < NUM_FRAME_CONTEXTS; ++i)
+      vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+  } else if (cm->reset_frame_context == 2) {
+    // Reset only the frame context specified in the frame header.
+    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
+               sizeof(cm->fc));
   }
 
   vpx_memset(cm->prev_mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
   vpx_memset(cm->mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
 
   vp9_update_mode_info_border(cm, cm->mip);
   vp9_update_mode_info_in_image(cm, cm->mi);
 
-#if CONFIG_NEW_MVREF
-  // Defaults probabilities for encoding the MV ref id signal
-  vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
-             sizeof(xd->mb_mv_ref_probs));
-#endif
-  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+  vp9_update_mode_info_border(cm, cm->prev_mip);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
+
+  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 
   cm->frame_context_idx = 0;
 }
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,61 +15,35 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define SUBMVREF_COUNT 5
-#define VP9_NUMMBSPLITS 4
+#define TX_SIZE_CONTEXTS 2
 
-#if CONFIG_COMP_INTERINTRA_PRED
-#define VP9_DEF_INTERINTRA_PROB 248
-#define VP9_UPD_INTERINTRA_PROB 192
-// whether to use a separate uv mode (1) or use the same as the y mode (0)
-#define SEPARATE_INTERINTRA_UV  0
-#endif
+#define VP9_MODE_UPDATE_PROB  252
 
-typedef const int vp9_mbsplit[16];
+// #define MODE_STATS
 
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-
 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
 
-extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
 
-extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES]
-                                                     [VP9_KF_BINTRAMODES]
-                                                     [VP9_KF_BINTRAMODES];
+extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
+                                                [VP9_INTRA_MODES]
+                                                [VP9_INTRA_MODES - 1];
 
-extern const vp9_tree_index vp9_bmode_tree[];
-extern const vp9_tree_index vp9_kf_bmode_tree[];
-
-extern const vp9_tree_index  vp9_ymode_tree[];
-extern const vp9_tree_index  vp9_kf_ymode_tree[];
-extern const vp9_tree_index  vp9_uv_mode_tree[];
-#define vp9_sb_ymode_tree vp9_uv_mode_tree
-#define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-extern const vp9_tree_index  vp9_mv_ref_tree[];
+extern const vp9_tree_index vp9_intra_mode_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
 
-extern struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
-extern struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
-extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
 
 /* Inter mode values do not start at zero */
 
-extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
+extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
 
+// probability models for partition information
+extern const vp9_tree_index  vp9_partition_tree[];
+extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
+extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES]
+                                         [NUM_PARTITION_CONTEXTS]
+                                         [PARTITION_TYPES - 1];
+
 void vp9_entropy_mode_init(void);
 
 struct VP9Common;
@@ -87,12 +61,6 @@
                               MB_PREDICTION_MODE m,
                               const int context);
 
-void vp9_default_bmode_probs(vp9_prob dest[VP9_NKF_BINTRAMODES - 1]);
-
-void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_KF_BINTRAMODES]
-                                             [VP9_KF_BINTRAMODES]
-                                             [VP9_KF_BINTRAMODES - 1]);
-
 void vp9_adapt_mode_probs(struct VP9Common *);
 
 #define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
@@ -107,10 +75,22 @@
 extern const  vp9_tree_index vp9_switchable_interp_tree
                   [2 * (VP9_SWITCHABLE_FILTERS - 1)];
 
-extern struct vp9_token_struct vp9_switchable_interp_encodings
-                  [VP9_SWITCHABLE_FILTERS];
+extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
 
 extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                                  [VP9_SWITCHABLE_FILTERS - 1];
 
+extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
+                                                 [TX_SIZE_MAX_SB - 1];
+extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
+                                                 [TX_SIZE_MAX_SB - 2];
+extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
+                                               [TX_SIZE_MAX_SB - 3];
+
+extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+                                             unsigned int (*ct_32x32p)[2]);
+extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+                                             unsigned int (*ct_16x16p)[2]);
+extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+                                           unsigned int (*ct_8x8p)[2]);
 #endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -14,16 +14,11 @@
 
 //#define MV_COUNT_TESTING
 
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 160
+#define MV_COUNT_SAT 20
+#define MV_MAX_UPDATE_FACTOR 128
 
-#if CONFIG_NEW_MVREF
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    1000000
-#else
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
 #define COMPANDED_MVREF_THRESH    8
-#endif
 
 /* Smooth or bias the mv-counts before prob computation */
 /* #define SMOOTH_MV_COUNTS */
@@ -33,7 +28,7 @@
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
-struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
 const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
   -MV_CLASS_0, 2,
@@ -47,12 +42,12 @@
   -MV_CLASS_7, -MV_CLASS_8,
   -MV_CLASS_9, -MV_CLASS_10,
 };
-struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
 const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
   -0, -1,
 };
-struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
 const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
   -0, 2,
@@ -59,7 +54,7 @@
   -1, 4,
   -2, -3
 };
-struct vp9_token_struct vp9_mv_fp_encodings[4];
+struct vp9_token vp9_mv_fp_encodings[4];
 
 const nmv_context vp9_default_nmv_context = {
   {32, 64, 96},
@@ -87,11 +82,15 @@
   },
 };
 
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
-  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
-  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
-  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
-  else return MV_JOINT_HNZVNZ;
+MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
+  if (mv->row == 0 && mv->col == 0)
+    return MV_JOINT_ZERO;
+  else if (mv->row == 0 && mv->col != 0)
+    return MV_JOINT_HNZVZ;
+  else if (mv->row != 0 && mv->col == 0)
+    return MV_JOINT_HZVNZ;
+  else
+    return MV_JOINT_HNZVNZ;
 }
 
 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
@@ -137,7 +136,8 @@
                                     int incr,
                                     int usehp) {
   int s, z, c, o, d, e, f;
-  if (!incr) return;
+  if (!incr)
+    return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
   mvcomp->sign[s] += incr;
@@ -152,8 +152,8 @@
   if (c == MV_CLASS_0) {
     mvcomp->class0[d] += incr;
   } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
+    int i;
+    int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
       mvcomp->bits[i][((d >> i) & 1)] += incr;
   }
@@ -204,25 +204,22 @@
 
 void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
                        int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   mvctx->joints[j]++;
   usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+  if (mv_joint_vertical(j))
     increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+
+  if (mv_joint_horizontal(j))
     increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
-  }
 }
 
-static void adapt_prob(vp9_prob *dest, vp9_prob prep,
-                       unsigned int ct[2]) {
-  int count = ct[0] + ct[1];
+static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
+  const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT);
   if (count) {
-    vp9_prob newp = get_binary_prob(ct[0], ct[1]);
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    *dest = weighted_prob(prep, newp,
-                          MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
+    const vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+    const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
+    *dest = weighted_prob(prep, newp, factor);
   } else {
     *dest = prep;
   }
@@ -253,10 +250,12 @@
                                    branch_ct_joint,
                                    nmv_count->joints, 0);
   for (i = 0; i < 2; ++i) {
-    prob->comps[i].sign = get_binary_prob(nmv_count->comps[i].sign[0],
-                                          nmv_count->comps[i].sign[1]);
-    branch_ct_sign[i][0] = nmv_count->comps[i].sign[0];
-    branch_ct_sign[i][1] = nmv_count->comps[i].sign[1];
+    const uint32_t s0 = nmv_count->comps[i].sign[0];
+    const uint32_t s1 = nmv_count->comps[i].sign[1];
+
+    prob->comps[i].sign = get_binary_prob(s0, s1);
+    branch_ct_sign[i][0] = s0;
+    branch_ct_sign[i][1] = s1;
     vp9_tree_probs_from_distribution(vp9_mv_class_tree,
                                      prob->comps[i].classes,
                                      branch_ct_classes[i],
@@ -266,10 +265,12 @@
                                      branch_ct_class0[i],
                                      nmv_count->comps[i].class0, 0);
     for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      prob->comps[i].bits[j] = get_binary_prob(nmv_count->comps[i].bits[j][0],
-                                               nmv_count->comps[i].bits[j][1]);
-      branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0];
-      branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1];
+      const uint32_t b0 = nmv_count->comps[i].bits[j][0];
+      const uint32_t b1 = nmv_count->comps[i].bits[j][1];
+
+      prob->comps[i].bits[j] = get_binary_prob(b0, b1);
+      branch_ct_bits[i][j][0] = b0;
+      branch_ct_bits[i][j][1] = b1;
     }
   }
   for (i = 0; i < 2; ++i) {
@@ -286,16 +287,18 @@
   }
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      prob->comps[i].class0_hp =
-          get_binary_prob(nmv_count->comps[i].class0_hp[0],
-                          nmv_count->comps[i].class0_hp[1]);
-      branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0];
-      branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1];
+      const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
+      const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
+      const uint32_t hp0 = nmv_count->comps[i].hp[0];
+      const uint32_t hp1 = nmv_count->comps[i].hp[1];
 
-      prob->comps[i].hp = get_binary_prob(nmv_count->comps[i].hp[0],
-                                          nmv_count->comps[i].hp[1]);
-      branch_ct_hp[i][0] = nmv_count->comps[i].hp[0];
-      branch_ct_hp[i][1] = nmv_count->comps[i].hp[1];
+      prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
+      branch_ct_class0_hp[i][0] = c0_hp0;
+      branch_ct_class0_hp[i][1] = c0_hp1;
+
+      prob->comps[i].hp = get_binary_prob(hp0, hp1);
+      branch_ct_hp[i][0] = hp0;
+      branch_ct_hp[i][1] = hp1;
     }
   }
 }
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -24,14 +24,8 @@
 void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
 int vp9_use_nmv_hp(const MV *ref);
 
-#define VP9_NMV_UPDATE_PROB  255
+#define VP9_NMV_UPDATE_PROB  252
 
-#if CONFIG_NEW_MVREF
-#define VP9_MVREF_UPDATE_PROB 252
-#define VP9_DEFAULT_MV_REF_PROB 192
-#define VP9_MV_REF_UPDATE_COST (14 << 8)
-#endif
-
 //#define MV_GROUP_UPDATE
 
 #define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
@@ -45,8 +39,16 @@
   MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
 } MV_JOINT_TYPE;
 
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+  return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+  return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
 extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
 /* Symbols for coding magnitude class of nonzero components */
 #define MV_CLASSES     11
@@ -65,7 +67,7 @@
 } MV_CLASS_TYPE;
 
 extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
 #define CLASS0_BITS    1  /* bits at integer precision for class 0 */
 #define CLASS0_SIZE    (1 << CLASS0_BITS)
@@ -76,10 +78,10 @@
 #define MV_VALS        ((MV_MAX << 1) + 1)
 
 extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
-extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
 extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
-extern struct vp9_token_struct vp9_mv_fp_encodings[4];
+extern struct vp9_token vp9_mv_fp_encodings[4];
 
 typedef struct {
   vp9_prob sign;
@@ -97,7 +99,7 @@
   nmv_component comps[2];
 } nmv_context;
 
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
+MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv);
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
 int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
 
--- /dev/null
+++ b/vp9/common/vp9_enums.h
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ENUMS_H_
+#define VP9_COMMON_VP9_ENUMS_H_
+
+#include "./vpx_config.h"
+
+#define LOG2_MI_SIZE 3
+
+#define MI_SIZE (1 << LOG2_MI_SIZE)
+#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
+
+typedef enum BLOCK_SIZE_TYPE {
+  BLOCK_SIZE_AB4X4,
+  BLOCK_SIZE_SB4X8,
+  BLOCK_SIZE_SB8X4,
+  BLOCK_SIZE_SB8X8,
+  BLOCK_SIZE_SB8X16,
+  BLOCK_SIZE_SB16X8,
+  BLOCK_SIZE_MB16X16,
+  BLOCK_SIZE_SB16X32,
+  BLOCK_SIZE_SB32X16,
+  BLOCK_SIZE_SB32X32,
+  BLOCK_SIZE_SB32X64,
+  BLOCK_SIZE_SB64X32,
+  BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
+} BLOCK_SIZE_TYPE;
+
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_TYPES
+} PARTITION_TYPE;
+
+#define PARTITION_PLOFFSET   4  // number of probability models per block size
+#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+
+#endif  // VP9_COMMON_VP9_ENUMS_H_
--- a/vp9/common/vp9_extend.c
+++ b/vp9/common/vp9_extend.c
@@ -60,11 +60,23 @@
   const int eb_y = dst->border + dst->y_height - src->y_height;
   const int er_y = dst->border + dst->y_width - src->y_width;
 
-  const int et_uv = dst->border >> 1;
-  const int el_uv = dst->border >> 1;
-  const int eb_uv = (dst->border >> 1) + dst->uv_height - src->uv_height;
-  const int er_uv = (dst->border >> 1) + dst->uv_width - src->uv_width;
+  const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
+  const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
+  const int eb_uv = et_uv + dst->uv_height - src->uv_height;
+  const int er_uv = el_uv + dst->uv_width - src->uv_width;
 
+#if CONFIG_ALPHA
+  const int et_a = dst->border >> (dst->alpha_height != dst->y_height);
+  const int el_a = dst->border >> (dst->alpha_width != dst->y_width);
+  const int eb_a = et_a + dst->alpha_height - src->alpha_height;
+  const int er_a = el_a + dst->alpha_width - src->alpha_width;
+
+  copy_and_extend_plane(src->alpha_buffer, src->alpha_stride,
+                        dst->alpha_buffer, dst->alpha_stride,
+                        src->alpha_width, src->alpha_height,
+                        et_a, el_a, eb_a, er_a);
+#endif
+
   copy_and_extend_plane(src->y_buffer, src->y_stride,
                         dst->y_buffer, dst->y_stride,
                         src->y_width, src->y_height,
@@ -78,7 +90,7 @@
   copy_and_extend_plane(src->v_buffer, src->uv_stride,
                         dst->v_buffer, dst->uv_stride,
                         src->uv_width, src->uv_height,
-                        et_y, el_y, eb_uv, er_uv);
+                        et_uv, el_uv, eb_uv, er_uv);
 }
 
 void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
@@ -118,30 +130,4 @@
                         dst->v_buffer + dst_uv_offset, dst->uv_stride,
                         srcw_uv, srch_uv,
                         et_uv, el_uv, eb_uv, er_uv);
-}
-
-// note the extension is only for the last row, for intra prediction purpose
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,
-                       uint8_t *y, uint8_t *u, uint8_t *v) {
-  int i;
-
-  y += buf->y_stride * 14;
-  u += buf->uv_stride * 6;
-  v += buf->uv_stride * 6;
-
-  for (i = 0; i < 4; i++) {
-    y[i] = y[-1];
-    u[i] = u[-1];
-    v[i] = v[-1];
-  }
-
-  y += buf->y_stride;
-  u += buf->uv_stride;
-  v += buf->uv_stride;
-
-  for (i = 0; i < 4; i++) {
-    y[i] = y[-1];
-    u[i] = u[-1];
-    v[i] = v[-1];
-  }
 }
--- a/vp9/common/vp9_extend.h
+++ b/vp9/common/vp9_extend.h
@@ -22,9 +22,4 @@
                                          YV12_BUFFER_CONFIG *dst,
                                          int srcy, int srcx,
                                          int srch, int srcw);
-
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,
-                       uint8_t *y, uint8_t *u, uint8_t *v);
-
-
 #endif  // VP9_COMMON_VP9_EXTEND_H_
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -34,12 +34,7 @@
   { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
-#define FILTER_ALPHA        0
-#define FILTER_ALPHA_SHARP  0
-#define FILTER_ALPHA_SMOOTH 50
-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8])
-    = {
-#if FILTER_ALPHA == 0
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
   /* Lagrangian interpolation filter */
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
@@ -57,38 +52,10 @@
   { -1,   3,  -9,  27, 118, -13,   4, -1},
   { 0,   2,  -6,  18, 122, -10,   3, -1},
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
-
-#elif FILTER_ALPHA == 50
-  /* Generated using MATLAB:
-   * alpha = 0.5;
-   * b=intfilt(8,4,alpha);
-   * bi=round(128*b);
-   * ba=flipud(reshape([bi 0], 8, 8));
-   * disp(num2str(ba, '%d,'))
-   */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { 0,   2, -10, 122,  18,  -6,   2,  0},
-  { -1,   3, -13, 118,  27,  -9,   3,  0},
-  { -1,   4, -16, 112,  37, -11,   3,  0},
-  { -1,   5, -17, 104,  48, -14,   4, -1},
-  { -1,   5, -18,  96,  58, -16,   5, -1},
-  { -1,   5, -19,  88,  68, -17,   5, -1},
-  { -1,   5, -18,  78,  78, -18,   5, -1},
-  { -1,   5, -17,  68,  88, -19,   5, -1},
-  { -1,   5, -16,  58,  96, -18,   5, -1},
-  { -1,   4, -14,  48, 104, -17,   5, -1},
-  { 0,   3, -11,  37, 112, -16,   4, -1},
-  { 0,   3,  -9,  27, 118, -13,   3, -1},
-  { 0,   2,  -6,  18, 122, -10,   2,  0},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-
-#endif  /* FILTER_ALPHA */
 };
 
 DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
     = {
-#if FILTER_ALPHA_SHARP == 0
   /* dct based filter */
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
@@ -106,88 +73,25 @@
   {-2,   5, -10,  27, 121, -17,   7, -3},
   {-1,   3,  -6,  17, 125, -13,   5, -2},
   {0,   1,  -3,   8, 127,  -7,   3, -1}
-
-#elif FILTER_ALPHA_SHARP == 80
-  /* alpha = 0.80 */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  {-1,   2,  -6, 127,   9,  -4,   2, -1},
-  {-2,   5, -12, 124,  18,  -7,   4, -2},
-  {-2,   7, -16, 119,  28, -11,   5, -2},
-  {-3,   8, -19, 114,  38, -14,   7, -3},
-  {-3,   9, -22, 107,  49, -17,   8, -3},
-  {-4,  10, -23,  99,  60, -20,  10, -4},
-  {-4,  11, -23,  90,  70, -22,  10, -4},
-  {-4,  11, -23,  80,  80, -23,  11, -4},
-  {-4,  10, -22,  70,  90, -23,  11, -4},
-  {-4,  10, -20,  60,  99, -23,  10, -4},
-  {-3,   8, -17,  49, 107, -22,   9, -3},
-  {-3,   7, -14,  38, 114, -19,   8, -3},
-  {-2,   5, -11,  28, 119, -16,   7, -2},
-  {-2,   4,  -7,  18, 124, -12,   5, -2},
-  {-1,   2,  -4,   9, 127,  -6,   2, -1}
-#endif  /* FILTER_ALPHA_SHARP */
 };
 
 DECLARE_ALIGNED(256, const int16_t,
                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
-  /* 8-tap lowpass filter */
-  /* Hamming window */
-  /* freqmultiplier = 0.625 */
-#if FILTER_ALPHA_SMOOTH == 625
-  {-1, -7, 32, 80, 32, -7, -1,  0},
-  {-1, -8, 28, 80, 37, -7, -2,  1},
-  { 0, -8, 24, 79, 41, -7, -2,  1},
-  { 0, -8, 20, 78, 45, -5, -3,  1},
-  { 0, -8, 16, 76, 50, -4, -3,  1},
-  { 0, -7, 13, 74, 54, -3, -4,  1},
-  { 1, -7,  9, 71, 58, -1, -4,  1},
-  { 1, -6,  6, 68, 62,  1, -5,  1},
-  { 1, -6,  4, 65, 65,  4, -6,  1},
-  { 1, -5,  1, 62, 68,  6, -6,  1},
-  { 1, -4, -1, 58, 71,  9, -7,  1},
-  { 1, -4, -3, 54, 74, 13, -7,  0},
-  { 1, -3, -4, 50, 76, 16, -8,  0},
-  { 1, -3, -5, 45, 78, 20, -8,  0},
-  { 1, -2, -7, 41, 79, 24, -8,  0},
-  { 1, -2, -7, 37, 80, 28, -8, -1}
-
-#elif FILTER_ALPHA_SMOOTH == 50
   /* freqmultiplier = 0.5 */
-  {-3,  0, 35, 64, 35,  0, -3, 0},
-  {-3, -1, 32, 64, 38,  1, -3, 0},
-  {-2, -2, 29, 63, 41,  2, -3, 0},
-  {-2, -2, 26, 63, 43,  4, -4, 0},
-  {-2, -3, 24, 62, 46,  5, -4, 0},
-  {-2, -3, 21, 60, 49,  7, -4, 0},
-  {-1, -4, 18, 59, 51,  9, -4, 0},
-  {-1, -4, 16, 57, 53, 12, -4, -1},
-  {-1, -4, 14, 55, 55, 14, -4, -1},
-  {-1, -4, 12, 53, 57, 16, -4, -1},
-  {0, -4,  9, 51, 59, 18, -4, -1},
-  {0, -4,  7, 49, 60, 21, -3, -2},
-  {0, -4,  5, 46, 62, 24, -3, -2},
-  {0, -4,  4, 43, 63, 26, -2, -2},
-  {0, -3,  2, 41, 63, 29, -2, -2},
-  {0, -3,  1, 38, 64, 32, -1, -3}
-#endif
-};
-
-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8])
-    = {
-  {0, 0,   0, 128,   0,   0, 0,  0},
-  {0, 1,  -5, 125,   8,  -2, 1,  0},
-  {0, 1,  -8, 122,  17,  -5, 1,  0},
-  {0, 2, -11, 116,  27,  -8, 2,  0},
-  {0, 3, -14, 110,  37, -10, 2,  0},
-  {0, 3, -15, 103,  47, -12, 2,  0},
-  {0, 3, -16,  95,  57, -14, 3,  0},
-  {0, 3, -16,  86,  67, -15, 3,  0},
-  {0, 3, -16,  77,  77, -16, 3,  0},
-  {0, 3, -15,  67,  86, -16, 3,  0},
-  {0, 3, -14,  57,  95, -16, 3,  0},
-  {0, 2, -12,  47, 103, -15, 3,  0},
-  {0, 2, -10,  37, 110, -14, 3,  0},
-  {0, 2,  -8,  27, 116, -11, 2,  0},
-  {0, 1,  -5,  17, 122,  -8, 1,  0},
-  {0, 1,  -2,   8, 125,  -5, 1,  0}
+  { 0,  0,  0, 128,  0,  0,  0,  0},
+  {-3, -1, 32,  64, 38,  1, -3,  0},
+  {-2, -2, 29,  63, 41,  2, -3,  0},
+  {-2, -2, 26,  63, 43,  4, -4,  0},
+  {-2, -3, 24,  62, 46,  5, -4,  0},
+  {-2, -3, 21,  60, 49,  7, -4,  0},
+  {-1, -4, 18,  59, 51,  9, -4,  0},
+  {-1, -4, 16,  57, 53, 12, -4, -1},
+  {-1, -4, 14,  55, 55, 14, -4, -1},
+  {-1, -4, 12,  53, 57, 16, -4, -1},
+  { 0, -4,  9,  51, 59, 18, -4, -1},
+  { 0, -4,  7,  49, 60, 21, -3, -2},
+  { 0, -4,  5,  46, 62, 24, -3, -2},
+  { 0, -4,  4,  43, 63, 26, -2, -2},
+  { 0, -3,  2,  41, 63, 29, -2, -2},
+  { 0, -3,  1,  38, 64, 32, -1, -3}
 };
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -8,22 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <limits.h>
 
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_sadmxn.h"
 #include "vp9/common/vp9_subpelvar.h"
 
-const uint8_t vp9_mbsplit_offset[4][16] = {
-  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-static void lower_mv_precision(int_mv *mv, int usehp)
-{
+static void lower_mv_precision(int_mv *mv, int usehp) {
   if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
     if (mv->as_mv.row & 1)
       mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
@@ -32,288 +24,73 @@
   }
 }
 
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[4], const int context) {
-  p[0] = pc->fc.vp9_mode_contexts[context][0];
-  p[1] = pc->fc.vp9_mode_contexts[context][1];
-  p[2] = pc->fc.vp9_mode_contexts[context][2];
-  p[3] = pc->fc.vp9_mode_contexts[context][3];
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, vp9_prob *p, int context) {
+  p[0] = pc->fc.inter_mode_probs[context][0];
+  p[1] = pc->fc.inter_mode_probs[context][1];
+  p[2] = pc->fc.inter_mode_probs[context][2];
   return p;
 }
 
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-unsigned int vp9_sad16x3_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-
-unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
-                                          int  source_stride,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse) {
-  uint16_t FData3[16 * 3];  // Temp data buffer used in filtering
-  uint8_t temp2[2 * 16];
-  const int16_t *HFilter, *VFilter;
-
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    source_stride, 1, 3, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
-  return vp9_variance16x2_c(temp2, 16, ref_ptr, ref_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
-                                          int  source_stride,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse) {
-  uint16_t FData3[2 * 17];  // Temp data buffer used in filtering
-  uint8_t temp2[2 * 16];
-  const int16_t *HFilter, *VFilter;
-
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    source_stride, 1, 17, 2, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
-  return vp9_variance2x16_c(temp2, 2, ref_ptr, ref_stride, sse);
-}
-
-#if CONFIG_USESELECTREFMV
-/* check a list of motion vectors by sad score using a number rows of pixels
- * above and a number cols of pixels in the left to select the one with best
- * score to use as ref motion vector
- */
-
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
                            int_mv *mvlist,
                            int_mv *nearest,
                            int_mv *near) {
-  int i, j;
-  uint8_t *above_src;
-  uint8_t *above_ref;
-#if !CONFIG_ABOVESPREFMV
-  uint8_t *left_src;
-  uint8_t *left_ref;
-#endif
-  unsigned int score;
-  unsigned int sse;
-  unsigned int ref_scores[MAX_MV_REF_CANDIDATES] = {0};
-  int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];
-  int zero_seen = FALSE;
+  int i;
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
+    clamp_mv2(&mvlist[i], xd);
+  }
+  *nearest = mvlist[0];
+  *near = mvlist[1];
+}
 
-  if (ref_y_buffer) {
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int_mv *dst_nearest,
+                                   int_mv *dst_near,
+                                   int block_idx, int ref_idx) {
+  int_mv dst_list[MAX_MV_REF_CANDIDATES];
+  int_mv mv_list[MAX_MV_REF_CANDIDATES];
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-    // Default all to 0,0 if nothing else available
-    nearest->as_int = near->as_int = 0;
-    vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+  assert(ref_idx == 0 || ref_idx == 1);
+  assert(MAX_MV_REF_CANDIDATES == 2);  // makes code here slightly easier
 
-    above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-    above_ref = ref_y_buffer - ref_y_stride * 2;
-#if CONFIG_ABOVESPREFMV
-    above_src -= 4;
-    above_ref -= 4;
-#else
-    left_src  = xd->dst.y_buffer - 2;
-    left_ref  = ref_y_buffer - 2;
-#endif
+  vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
+                       xd->prev_mode_info_context,
+                       mbmi->ref_frame[ref_idx],
+                       mv_list, cm->ref_frame_sign_bias, block_idx);
 
-    // Limit search to the predicted best few candidates
-    for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-      int_mv this_mv;
-      int offset = 0;
-      int row_offset, col_offset;
+  dst_list[1].as_int = 0;
+  if (block_idx == 0) {
+    memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
+  } else if (block_idx == 1 || block_idx == 2) {
+    int dst = 0, n;
+    union b_mode_info *bmi = mi->bmi;
 
-      this_mv.as_int = mvlist[i].as_int;
-
-      // If we see a 0,0 vector for a second time we have reached the end of
-      // the list of valid candidate vectors.
-      if (!this_mv.as_int && zero_seen)
-        break;
-
-      zero_seen = zero_seen || !this_mv.as_int;
-
-#if !CONFIG_ABOVESPREFMV
-      clamp_mv(&this_mv,
-               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#else
-      clamp_mv(&this_mv,
-               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
-               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#endif
-
-      row_offset = this_mv.as_mv.row >> 3;
-      col_offset = this_mv.as_mv.col >> 3;
-      offset = ref_y_stride * row_offset + col_offset;
-      score = 0;
-#if !CONFIG_ABOVESPREFMV
-      if (xd->up_available) {
-#else
-      if (xd->up_available && xd->left_available) {
-#endif
-        vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
-        score += sse;
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-          vp9_sub_pixel_variance16x2(above_ref + offset + 16,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 16, xd->dst.y_stride, &sse);
-          score += sse;
-        }
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-          vp9_sub_pixel_variance16x2(above_ref + offset + 32,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 32, xd->dst.y_stride, &sse);
-          score += sse;
-          vp9_sub_pixel_variance16x2(above_ref + offset + 48,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 48, xd->dst.y_stride, &sse);
-          score += sse;
-        }
-      }
-#if !CONFIG_ABOVESPREFMV
-      if (xd->left_available) {
-        vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     left_src, xd->dst.y_stride, &sse);
-        score += sse;
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 16,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-        }
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
-                                     ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 32,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 48,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-        }
-      }
-#endif
-      // Add the entry to our list and then resort the list on score.
-      ref_scores[i] = score;
-      sorted_mvs[i].as_int = this_mv.as_int;
-      j = i;
-      while (j > 0) {
-        if (ref_scores[j] < ref_scores[j-1]) {
-          ref_scores[j] = ref_scores[j-1];
-          sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-          ref_scores[j-1] = score;
-          sorted_mvs[j-1].as_int = this_mv.as_int;
-          j--;
-        } else {
-          break;
-        }
-      }
-    }
+    dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
+    for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
+                n < MAX_MV_REF_CANDIDATES; n++)
+      if (mv_list[n].as_int != dst_list[0].as_int)
+        dst_list[dst++].as_int = mv_list[n].as_int;
   } else {
-    vpx_memcpy(sorted_mvs, mvlist, sizeof(sorted_mvs));
-  }
+    int dst = 0, n;
+    union b_mode_info *bmi = mi->bmi;
 
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
-    clamp_mv2(&sorted_mvs[i], xd);
+    assert(block_idx == 3);
+    dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
+    if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int)
+      dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int;
+    if (dst < MAX_MV_REF_CANDIDATES &&
+        dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int)
+      dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
+    for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
+                n < MAX_MV_REF_CANDIDATES; n++)
+      if (mv_list[n].as_int != dst_list[0].as_int)
+        dst_list[dst++].as_int = mv_list[n].as_int;
   }
 
-  // Nearest may be a 0,0 or non zero vector and now matches the chosen
-  // "best reference". This has advantages when it is used as part of a
-  // compound predictor as it means a non zero vector can be paired using
-  // this mode with a 0 vector. The Near vector is still forced to be a
-  // non zero candidate if one is avaialble.
-  nearest->as_int = sorted_mvs[0].as_int;
-  if ( sorted_mvs[1].as_int ) {
-    near->as_int = sorted_mvs[1].as_int;
-  } else {
-    near->as_int = sorted_mvs[2].as_int;
-  }
-
-  // Copy back the re-ordered mv list
-  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
+  dst_nearest->as_int = dst_list[0].as_int;
+  dst_near->as_int = dst_list[1].as_int;
 }
-#else
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *nearest,
-                           int_mv *near) {
-  int i;
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
-    clamp_mv2(&mvlist[i], xd);
-  }
-  *nearest = mvlist[0];
-  *near = mvlist[1];
-}
-#endif
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -17,16 +17,13 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
+#define LEFT_TOP_MARGIN     ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
 
-/* check a list of motion vectors by sad score using a number rows of pixels
- * above and a number cols of pixels in the left to select the one with best
- * score to use as ref motion vector
- */
+// check a list of motion vectors by sad score using a number rows of pixels
+// above and a number cols of pixels in the left to select the one with best
+// score to use as ref motion vector
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
                            int_mv *mvlist,
                            int_mv *nearest,
                            int_mv *near);
@@ -43,35 +40,30 @@
   mvp->as_mv = xmv;
 }
 
-
+// TODO(jingning): this mv clamping function should be block size dependent.
 static void clamp_mv(int_mv *mv,
                      int mb_to_left_edge,
                      int mb_to_right_edge,
                      int mb_to_top_edge,
                      int mb_to_bottom_edge) {
-  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
-                  mb_to_left_edge : mv->as_mv.col;
-  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
-                  mb_to_right_edge : mv->as_mv.col;
-  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
-                  mb_to_top_edge : mv->as_mv.row;
-  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
-                  mb_to_bottom_edge : mv->as_mv.row;
+  mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge);
+  mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge);
 }
 
-static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  int_mv tmp_mv;
+  tmp_mv.as_int = mv->as_int;
   clamp_mv(mv,
            xd->mb_to_left_edge - LEFT_TOP_MARGIN,
            xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
            xd->mb_to_top_edge - LEFT_TOP_MARGIN,
            xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+  return tmp_mv.as_int != mv->as_int;
 }
 
-static unsigned int check_mv_bounds(int_mv *mv,
-                                    int mb_to_left_edge,
-                                    int mb_to_right_edge,
-                                    int mb_to_top_edge,
-                                    int mb_to_bottom_edge) {
+static int check_mv_bounds(int_mv *mv,
+                           int mb_to_left_edge, int mb_to_right_edge,
+                           int mb_to_top_edge, int mb_to_bottom_edge) {
   return mv->as_mv.col < mb_to_left_edge ||
          mv->as_mv.col > mb_to_right_edge ||
          mv->as_mv.row < mb_to_top_edge ||
@@ -79,116 +71,50 @@
 }
 
 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
+                           vp9_prob p[VP9_INTER_MODES - 1],
                            const int context);
 
-extern const uint8_t vp9_mbsplit_offset[4][16];
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
+                                   MACROBLOCKD *xd,
+                                   int_mv *dst_nearest,
+                                   int_mv *dst_near,
+                                   int block_idx, int ref_idx);
 
-static int left_block_mv(const MACROBLOCKD *xd,
-                         const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    if (!xd->left_available)
-      return 0;
-
-    // On L edge, get from MB to left of us
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-
-    b += 4;
-  }
-
-  return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
-}
-
-static int left_block_second_mv(const MACROBLOCKD *xd,
-                                const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    if (!xd->left_available)
-      return 0;
-
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+  // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
+  // understand this condition. This will go away soon.
+  if (b == 0 || b == 2) {
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame > 0 ?
-          cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 1)->as_mv[1].as_int :
-      (cur_mb->bmi + b - 1)->as_mv[0].as_int;
-}
-
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;
-}
-
-static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame > 0 ?
-          cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 4)->as_mv[1].as_int :
-      (cur_mb->bmi + b - 4)->as_mv[0].as_int;
-}
-
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 3 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+      return DC_PRED;
+    } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      return ((cur_mb->bmi + 1 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return cur_mb->mbmi.mode;
     }
   }
+  assert(b == 1 || b == 3);
   return (cur_mb->bmi + b - 1)->as_mode.first;
 }
 
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
                                           int b, int mi_stride) {
-  if (!(b >> 2)) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 12 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+      return DC_PRED;
+    } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      return ((cur_mb->bmi + 2 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return cur_mb->mbmi.mode;
     }
   }
 
-  return (cur_mb->bmi + b - 4)->as_mode.first;
+  return (cur_mb->bmi + b - 2)->as_mode.first;
 }
 
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
--- a/vp9/common/vp9_header.h
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_HEADER_H_
-#define VP9_COMMON_VP9_HEADER_H_
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^20 bytes = 8 megabits for first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-#endif  // VP9_COMMON_VP9_HEADER_H_
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,84 +18,84 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
   int i;
-  int a1, b1, c1, d1;
+  int16_t output[16];
+  int a1, b1, c1, d1, e1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
-    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
-    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
-    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
+    a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+    c1 = ip[1] >> WHT_UPSCALE_FACTOR;
+    d1 = ip[2] >> WHT_UPSCALE_FACTOR;
+    b1 = ip[3] >> WHT_UPSCALE_FACTOR;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
 
-
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
-
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
+  int a1, e1;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
+  a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = a1;
+  op[1] = op[2] = op[3] = e1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
-                                 uint8_t *dst_ptr,
-                                 int pitch, int stride) {
-  int r, c;
-  int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
@@ -116,10 +116,9 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +137,24 @@
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -219,14 +220,13 @@
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
-  // Rows
+  // First transform rows
   for (i = 0; i < 8; ++i) {
     idct8_1d(input, outptr);
     input += 8;
@@ -233,13 +233,14 @@
     outptr += 8;
   }
 
-  // Columns
+  // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -285,8 +286,8 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
@@ -312,10 +313,10 @@
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -400,8 +401,8 @@
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -421,14 +422,14 @@
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+                                int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
@@ -447,7 +448,8 @@
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -621,10 +623,9 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -641,7 +642,8 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -823,8 +825,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                              int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -844,38 +846,38 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = out;
-    const int half_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
 
-    /* First transform rows. Since all non-zero dct coefficients are in
-     * upper-left 4x4 area, we only need to calculate first 4 rows here.
-     */
-    vpx_memset(out, 0, sizeof(out));
-    for (i = 0; i < 4; ++i) {
-      idct16_1d(input, outptr);
-      input += 16;
-      outptr += 16;
-    }
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
 
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      idct16_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-    }
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
+  }
 }
 
-
 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
@@ -1249,10 +1251,9 @@
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1269,7 +1270,8 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -1279,10 +1281,10 @@
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1302,6 +1304,7 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -17,6 +17,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 
+
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
--- a/vp9/common/vp9_implicit_segmentation.c
+++ b/vp9/common/vp9_implicit_segmentation.c
@@ -140,11 +140,11 @@
           break;
         case SEGMENT_MV:
           n = mi[mb_index].mbmi.mv[0].as_int;
-          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+          if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME)
             n = -9999999;
           break;
         case SEGMENT_REFFRAME:
-          n = mi[mb_index].mbmi.ref_frame;
+          n = mi[mb_index].mbmi.ref_frame[0];
           break;
         case SEGMENT_SKIPPED:
           n = mi[mb_index].mbmi.mb_skip_coeff;
@@ -191,11 +191,12 @@
 
   // give new labels to regions
   for (i = 1; i < label; i++)
-    if (labels[i].next->count > min_mbs_in_region  &&  labels[labels[i].next->label].label == 0) {
+    if (labels[i].next->count > min_mbs_in_region &&
+        labels[labels[i].next->label].label == 0) {
       segment_info *cs = &segments[label_count];
       cs->label = label_count;
       labels[labels[i].next->label].label = label_count++;
-      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
+      labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
       cs->seg_value = labels[labels[i].next->label].seg_value;
       cs->min_x = oci->mb_cols;
       cs->min_y = oci->mb_rows;
@@ -204,24 +205,21 @@
       cs->sum_x = 0;
       cs->sum_y = 0;
       cs->pixels = 0;
-
     }
+
   lp = labeling;
 
   // this is just to gather stats...
   for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
     for (j = 0; j < oci->mb_cols; j++) {
-      segment_info *cs;
-      int oldlab = labels[lp[j]].next->label;
-      int lab = labels[oldlab].label;
-      lp[j] = lab;
+      const int old_lab = labels[lp[j]].next->label;
+      const int lab = labels[old_lab].label;
+      segment_info *cs = &segments[lab];
 
-      cs = &segments[lab];
-
-      cs->min_x = (j < cs->min_x ? j : cs->min_x);
-      cs->max_x = (j > cs->max_x ? j : cs->max_x);
-      cs->min_y = (i < cs->min_y ? i : cs->min_y);
-      cs->max_y = (i > cs->max_y ? i : cs->max_y);
+      cs->min_x = MIN(cs->min_x, j);
+      cs->max_x = MAX(cs->max_x, j);
+      cs->min_y = MIN(cs->min_y, i);
+      cs->max_y = MAX(cs->max_y, i);
       cs->sum_x += j;
       cs->sum_y += i;
       cs->pixels++;
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,311 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
-}
-
-void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
-    } else {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
-                                  xd->block[i].diff, 32);
-    }
-  }
-}
-
-void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
-                                xd->block[i].diff, 16);
-  }
-}
-
-void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_4x4(xd);
-  vp9_inverse_transform_mbuv_4x4(xd);
-}
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
-                                 int pitch) {
-  vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 9; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
-    } else {
-      vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
-                                  &blockd[i].diff[0], 32);
-    }
-  }
-  for (i = 2; i < 11; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
-                           16, tx_type);
-    } else {
-      vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
-                                  &blockd[i].diff[0], 32);
-    }
-  }
-}
-
-void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i += 4) {
-    vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_8x8(xd);
-  vp9_inverse_transform_mbuv_8x8(xd);
-}
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                   int16_t *output_coeff, int pitch) {
-  vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
-  BLOCKD *bd = &xd->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-  if (tx_type != DCT_DCT) {
-    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
-  } else {
-    vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
-                                  &xd->block[0].diff[0], 32);
-  }
-}
-
-void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_16x16(xd);
-  vp9_inverse_transform_mbuv_8x8(xd);
-}
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) {
-  vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64);
-}
-
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                    xd->diff + x_idx * 16 + y_idx * 32 * 16,
-                                    64);
-    } else {
-      vp9_short_iht16x16(xd->dqcoeff + n * 256,
-                         xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                  xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);
-    } else {
-      vp9_short_iht8x8(xd->dqcoeff + n * 64,
-                       xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                  xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);
-    } else {
-      vp9_short_iht4x4(xd->dqcoeff + n * 16,
-                       xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) {
-  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024,
-                                xd->diff + 1024, 32);
-  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280,
-                                xd->diff + 1280, 32);
-}
-
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64,
-                                xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8,
-                                32);
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64,
-                                xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8,
-                                32);
-  }
-}
-
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n],
-                                xd->dqcoeff + 1024 + n * 16,
-                                xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4,
-                                32);
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n],
-                                xd->dqcoeff + 1280 + n * 16,
-                                xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4,
-                                32);
-  }
-}
-
-void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_short_idct32x32(xd->dqcoeff + n * 1024,
-                        xd->diff + x_idx * 32 + y_idx * 32 * 64, 128);
-  }
-}
-
-void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                    xd->diff + x_idx * 16 + y_idx * 64 * 16,
-                                    128);
-    } else {
-      vp9_short_iht16x16(xd->dqcoeff + n * 256,
-                         xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                  xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);
-    } else {
-      vp9_short_iht8x8(xd->dqcoeff + n * 64,
-                       xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                  xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);
-    } else {
-      vp9_short_iht4x4(xd->dqcoeff + n * 16,
-                       xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) {
-  vp9_short_idct32x32(xd->dqcoeff + 4096,
-                      xd->diff + 4096, 64);
-  vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024,
-                      xd->diff + 4096 + 1024, 64);
-}
-
-void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16;
-
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256,
-                                  xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256,
-                                  xd->diff + 4096 + 1024 + off, 64);
-  }
-}
-
-void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8;
-
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64,
-                                xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64,
-                                xd->diff + 4096 + 1024 + off, 64);
-  }
-}
-
-void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4;
-
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n],
-                                xd->dqcoeff + 4096 + n * 16,
-                                xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n],
-                                xd->dqcoeff + 4096 + 1024 + n * 16,
-                                xd->diff + 4096 + 1024 + off, 64);
-  }
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,47 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
-
-void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
-                                        int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                          int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd);
-
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -11,46 +11,26 @@
 #include "vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
 static void lf_init_lut(loop_filter_info_n *lfi) {
-  int filt_lvl;
-
-  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
-    if (filt_lvl >= 40) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
-    } else if (filt_lvl >= 20) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
-    } else if (filt_lvl >= 15) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
-    } else {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
-    }
-  }
-
-  lfi->mode_lf_lut[DC_PRED] = 1;
-  lfi->mode_lf_lut[D45_PRED] = 1;
-  lfi->mode_lf_lut[D135_PRED] = 1;
-  lfi->mode_lf_lut[D117_PRED] = 1;
-  lfi->mode_lf_lut[D153_PRED] = 1;
-  lfi->mode_lf_lut[D27_PRED] = 1;
-  lfi->mode_lf_lut[D63_PRED] = 1;
-  lfi->mode_lf_lut[V_PRED] = 1;
-  lfi->mode_lf_lut[H_PRED] = 1;
-  lfi->mode_lf_lut[TM_PRED] = 1;
-  lfi->mode_lf_lut[B_PRED]  = 0;
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-  lfi->mode_lf_lut[ZEROMV]  = 1;
-  lfi->mode_lf_lut[NEARESTMV] = 2;
-  lfi->mode_lf_lut[NEARMV] = 2;
-  lfi->mode_lf_lut[NEWMV] = 2;
-  lfi->mode_lf_lut[SPLITMV] = 3;
+  lfi->mode_lf_lut[DC_PRED] = 0;
+  lfi->mode_lf_lut[D45_PRED] = 0;
+  lfi->mode_lf_lut[D135_PRED] = 0;
+  lfi->mode_lf_lut[D117_PRED] = 0;
+  lfi->mode_lf_lut[D153_PRED] = 0;
+  lfi->mode_lf_lut[D27_PRED] = 0;
+  lfi->mode_lf_lut[D63_PRED] = 0;
+  lfi->mode_lf_lut[V_PRED] = 0;
+  lfi->mode_lf_lut[H_PRED] = 0;
+  lfi->mode_lf_lut[TM_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV]  = 0;
+  lfi->mode_lf_lut[NEARESTMV] = 1;
+  lfi->mode_lf_lut[NEARMV] = 1;
+  lfi->mode_lf_lut[NEWMV] = 1;
 }
 
 void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
@@ -86,25 +66,28 @@
   loop_filter_info_n *lfi = &cm->lf_info;
   int i;
 
-  /* init limits for given sharpness*/
+  // init limits for given sharpness
   vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
   cm->last_sharpness_level = cm->sharpness_level;
 
-  /* init LUT for lvl  and hev thr picking */
+  // init LUT for lvl and hev thr picking
   lf_init_lut(lfi);
 
-  /* init hev threshold const vectors */
-  for (i = 0; i < 4; i++) {
+  // init hev threshold const vectors
+  for (i = 0; i < 4; i++)
     vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
-  }
 }
 
 void vp9_loop_filter_frame_init(VP9_COMMON *cm,
                                 MACROBLOCKD *xd,
                                 int default_filt_lvl) {
-  int seg,  /* segment number */
-      ref,  /* index in ref_lf_deltas */
-      mode; /* index in mode_lf_deltas */
+  int seg,    // segment number
+      ref,    // index in ref_lf_deltas
+      mode;   // index in mode_lf_deltas
+  // n_shift is the multiplier for lf_deltas:
+  // the multiplier is 1 when filter_lvl is between 0 and 31,
+  // and 2 when filter_lvl is between 32 and 63
+  int n_shift = default_filt_lvl >> 5;
 
   loop_filter_info_n *lfi = &cm->lf_info;
 
@@ -147,360 +130,278 @@
     ref = INTRA_FRAME;
 
     /* Apply delta for reference frame */
-    lvl_ref += xd->ref_lf_deltas[ref];
+    lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
 
-    /* Apply delta for Intra modes */
-    mode = 0; /* B_PRED */
-    /* Only the split mode BPRED has a further special case */
-    lvl_mode = clamp(lvl_ref +  xd->mode_lf_deltas[mode], 0, 63);
+    mode = 0; /* all the rest of Intra modes */
+    lvl_mode = lvl_ref;
+    lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
 
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    mode = 1; /* all the rest of Intra modes */
-    lvl_mode = clamp(lvl_ref, 0, 63);
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
     /* LAST, GOLDEN, ALT */
     for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
       int lvl_ref = lvl_seg;
 
       /* Apply delta for reference frame */
-      lvl_ref += xd->ref_lf_deltas[ref];
+      lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
 
       /* Apply delta for Inter modes */
-      for (mode = 1; mode < 4; mode++) {
-        lvl_mode = clamp(lvl_ref + xd->mode_lf_deltas[mode], 0, 63);
-        lfi->lvl[seg][ref][mode] = lvl_mode;
+      for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
+        lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift);
+        lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
       }
     }
   }
 }
 
-// Determine if we should skip inner-MB loop filtering within a MB
-// The current condition is that the loop filtering is skipped only
-// the MB uses a prediction size of 16x16 and either 16x16 transform
-// is used or there is no residue at all.
-static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-  const int skip_coef = mbmi->mb_skip_coeff;
-  const int tx_size = mbmi->txfm_size;
-  return mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV &&
-         (tx_size >= TX_16X16 || skip_coef);
-}
+static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
+                      struct loop_filter_info *lfi) {
+  const loop_filter_info_n *lfi_n = &cm->lf_info;
+  int mode = mbmi->mode;
+  int mode_index = lfi_n->mode_lf_lut[mode];
+  int seg = mbmi->segment_id;
+  int ref_frame = mbmi->ref_frame[0];
+  int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
 
-// Determine if we should skip MB loop filtering on a MB edge within
-// a superblock, the current condition is that MB loop filtering is
-// skipped only when both MBs do not use inner MB loop filtering, and
-// same motion vector with same reference frame
-static int sb_mb_lf_skip(const MODE_INFO *const mip0,
-                         const MODE_INFO *const mip1) {
-  const MB_MODE_INFO *mbmi0 = &mip0->mbmi;
-  const MB_MODE_INFO *mbmi1 = &mip0->mbmi;
-  return mb_lf_skip(mbmi0) && mb_lf_skip(mbmi1) &&
-         (mbmi0->ref_frame == mbmi1->ref_frame) &&
-         (mbmi0->mv[mbmi0->ref_frame].as_int ==
-          mbmi1->mv[mbmi1->ref_frame].as_int) &&
-         mbmi0->ref_frame != INTRA_FRAME;
+  if (filter_level) {
+    const int hev_index = filter_level >> 4;
+    lfi->mblim = lfi_n->mblim[filter_level];
+    lfi->blim = lfi_n->blim[filter_level];
+    lfi->lim = lfi_n->lim[filter_level];
+    lfi->hev_thr = lfi_n->hev_thr[hev_index];
+    return 1;
+  }
+  return 0;
 }
 
-void vp9_loop_filter_frame(VP9_COMMON *cm,
-                           MACROBLOCKD *xd,
-                           int frame_filter_level,
-                           int y_only,
-                           int dering) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-  const FRAME_TYPE frame_type = cm->frame_type;
-  int mb_row, mb_col;
-  uint8_t *y_ptr, *u_ptr, *v_ptr;
+static void filter_selectively_vert(uint8_t *s, int pitch,
+                                    unsigned int mask_16x16,
+                                    unsigned int mask_8x8,
+                                    unsigned int mask_4x4,
+                                    unsigned int mask_4x4_int,
+                                    const struct loop_filter_info *lfi) {
+  unsigned int mask;
 
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-  const int mis = cm->mode_info_stride;
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+        assert(!(mask_8x8 & 1));
+        assert(!(mask_4x4 & 1));
+        assert(!(mask_4x4_int & 1));
+      } else if (mask_8x8 & 1) {
+        vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 1);
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_4x4 & 1));
+      } else if (mask_4x4 & 1) {
+        vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1);
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_8x8 & 1));
+      } else {
+        assert(0);
+      }
 
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-  if (y_only) {
-    u_ptr = 0;
-    v_ptr = 0;
-  } else {
-    u_ptr = post->u_buffer;
-    v_ptr = post->v_buffer;
+      if (mask_4x4_int & 1)
+        vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1);
+    }
+    s += 8;
+    lfi++;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
+}
 
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      const MB_PREDICTION_MODE mode = mode_info_context->mbmi.mode;
-      const int mode_index = lfi_n->mode_lf_lut[mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      const int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-      if (filter_level) {
-        const int skip_lf = mb_lf_skip(&mode_info_context->mbmi);
-        const int tx_size = mode_info_context->mbmi.txfm_size;
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+static void filter_selectively_horiz(uint8_t *s, int pitch,
+                                     unsigned int mask_16x16,
+                                     unsigned int mask_8x8,
+                                     unsigned int mask_4x4,
+                                     unsigned int mask_4x4_int,
+                                     int only_4x4_1,
+                                     const struct loop_filter_info *lfi) {
+  unsigned int mask;
 
-          if (mb_col > 0 &&
-              !((mb_col & 1) && mode_info_context->mbmi.sb_type &&
-                (sb_mb_lf_skip(mode_info_context - 1, mode_info_context) ||
-                 tx_size >= TX_32X32))
-              ) {
-            if (tx_size >= TX_16X16)
-              vp9_lpf_mbv_w(y_ptr, u_ptr, v_ptr, post->y_stride,
-                            post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                  post->uv_stride, &lfi);
-          }
-          if (!skip_lf) {
-            if (tx_size >= TX_8X8) {
-              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-                vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                      post->uv_stride, &lfi);
-              else
-                vp9_loop_filter_bv8x8(y_ptr, NULL, NULL, post->y_stride,
-                                      post->uv_stride, &lfi);
-            } else {
-              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-            }
-          }
-          /* don't apply across umv border */
-          if (mb_row > 0 &&
-              !((mb_row & 1) && mode_info_context->mbmi.sb_type &&
-                (sb_mb_lf_skip(mode_info_context - mis, mode_info_context) ||
-                tx_size >= TX_32X32))
-              ) {
-            if (tx_size >= TX_16X16)
-              vp9_lpf_mbh_w(y_ptr, u_ptr, v_ptr, post->y_stride,
-                            post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                  post->uv_stride, &lfi);
-          }
-          if (!skip_lf) {
-            if (tx_size >= TX_8X8) {
-              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-                vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                      post->uv_stride, &lfi);
-              else
-                vp9_loop_filter_bh8x8(y_ptr, NULL, NULL, post->y_stride,
-                                      post->uv_stride, &lfi);
-            } else {
-              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-            }
-          }
-#if CONFIG_LOOP_DERING
-          if (dering) {
-            if (mb_row && mb_row < cm->mb_rows - 1 &&
-                mb_col && mb_col < cm->mb_cols - 1) {
-              vp9_post_proc_down_and_across(y_ptr, y_ptr,
-                                            post->y_stride, post->y_stride,
-                                            16, 16, dering);
-              if (!y_only) {
-                vp9_post_proc_down_and_across(u_ptr, u_ptr,
-                                              post->uv_stride, post->uv_stride,
-                                              8, 8, dering);
-                vp9_post_proc_down_and_across(v_ptr, v_ptr,
-                                              post->uv_stride, post->uv_stride,
-                                              8, 8, dering);
-              }
-            } else {
-              // Adjust the filter so that no out-of-frame data is used.
-              uint8_t *dr_y = y_ptr, *dr_u = u_ptr, *dr_v = v_ptr;
-              int w_adjust = 0;
-              int h_adjust = 0;
-
-              if (mb_col == 0) {
-                dr_y += 2;
-                dr_u += 2;
-                dr_v += 2;
-                w_adjust += 2;
-              }
-              if (mb_col == cm->mb_cols - 1)
-                w_adjust += 2;
-              if (mb_row == 0) {
-                dr_y += 2 * post->y_stride;
-                dr_u += 2 * post->uv_stride;
-                dr_v += 2 * post->uv_stride;
-                h_adjust += 2;
-              }
-              if (mb_row == cm->mb_rows - 1)
-                h_adjust += 2;
-              vp9_post_proc_down_and_across_c(dr_y, dr_y,
-                                              post->y_stride, post->y_stride,
-                                              16 - w_adjust, 16 - h_adjust,
-                                              dering);
-              if (!y_only) {
-                vp9_post_proc_down_and_across_c(dr_u, dr_u,
-                                                post->uv_stride,
-                                                post->uv_stride,
-                                                8 - w_adjust, 8 - h_adjust,
-                                                dering);
-                vp9_post_proc_down_and_across_c(dr_v, dr_v,
-                                                post->uv_stride,
-                                                post->uv_stride,
-                                                8 - w_adjust, 8 - h_adjust,
-                                                dering);
-              }
-            }
-          }
-#endif
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+    if (mask & 1) {
+      if (!only_4x4_1) {
+        if (mask_16x16 & 1) {
+          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1);
+          assert(!(mask_8x8 & 1));
+          assert(!(mask_4x4 & 1));
+          assert(!(mask_4x4_int & 1));
+        } else if (mask_8x8 & 1) {
+          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, 1);
+          assert(!(mask_16x16 & 1));
+          assert(!(mask_4x4 & 1));
+        } else if (mask_4x4 & 1) {
+          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                          lfi->hev_thr, 1);
+          assert(!(mask_16x16 & 1));
+          assert(!(mask_8x8 & 1));
         } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0 &&
-              !(skip_lf && mb_lf_skip(&mode_info_context[-1].mbmi)) &&
-              !((mb_col & 1) && mode_info_context->mbmi.sb_type))
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0 &&
-              !(skip_lf && mb_lf_skip(&mode_info_context[-mis].mbmi)) &&
-              !((mb_row & 1) && mode_info_context->mbmi.sb_type))
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
+          assert(0);
         }
       }
-      y_ptr += 16;
-      if (!y_only) {
-        u_ptr += 8;
-        v_ptr += 8;
-      }
-      mode_info_context++;     /* step to next MB */
+
+      if (mask_4x4_int & 1)
+        vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1);
     }
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    if (!y_only) {
-      u_ptr += post->uv_stride *  8 - post->uv_width;
-      v_ptr += post->uv_stride *  8 - post->uv_width;
-    }
-    mode_info_context++;         /* Skip border mb */
+    s += 8;
+    lfi++;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 
+static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
+                               int plane, int mi_row, int mi_col) {
+  const int ss_x = xd->plane[plane].subsampling_x;
+  const int ss_y = xd->plane[plane].subsampling_y;
+  const int row_step = 1 << xd->plane[plane].subsampling_y;
+  const int col_step = 1 << xd->plane[plane].subsampling_x;
+  struct buf_2d * const dst = &xd->plane[plane].dst;
+  uint8_t* const dst0 = dst->buf;
+  MODE_INFO* const mi0 = xd->mode_info_context;
+  unsigned int mask_16x16[64 / MI_SIZE] = {0};
+  unsigned int mask_8x8[64 / MI_SIZE] = {0};
+  unsigned int mask_4x4[64 / MI_SIZE] = {0};
+  unsigned int mask_4x4_int[64 / MI_SIZE] = {0};
+  struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE];
+  int r, c;
 
-void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+  for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    unsigned int mask_16x16_c = 0;
+    unsigned int mask_8x8_c = 0;
+    unsigned int mask_4x4_c = 0;
+    unsigned int border_mask;
 
-  uint8_t *y_ptr;
-  int mb_row;
-  int mb_col;
-  int mb_cols = post->y_width  >> 4;
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *const mi = xd->mode_info_context;
+      const int skip_this = mi[c].mbmi.mb_skip_coeff
+                            && mi[c].mbmi.ref_frame != INTRA_FRAME;
+      // left edge of current unit is block/partition edge -> no skip
+      const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
+          !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
+      const int skip_this_c = skip_this && !block_edge_left;
+      // top edge of current unit is block/partition edge -> no skip
+      const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ?
+          !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
+      const int skip_this_r = skip_this && !block_edge_above;
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi)
+                                    : mi[c].mbmi.txfm_size;
+      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
-  int linestocopy, i;
+      // Filter level can vary per MI
+      if (!build_lfi(cm, &mi[c].mbmi,
+                     lfi[r] + (c >> xd->plane[plane].subsampling_x)))
+        continue;
 
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
+      // Build masks based on the transform size of each block
+      if (tx_size == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else if (tx_size == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c) {
+          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
 
-  int filter_level;
-  int alt_flt_enabled = xd->segmentation_enabled;
-  FRAME_TYPE frame_type = cm->frame_type;
+        if (!skip_this_r) {
+          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+            mask_8x8[r] |= 1 << (c >> ss_x);
+          else
+            mask_4x4[r] |= 1 << (c >> ss_x);
+        }
 
-  const MODE_INFO *mode_info_context;
-
-  int lvl_seg[MAX_MB_SEGMENTS];
-
-  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
-
-  /* 3 is a magic number. 4 is probably magic too */
-  linestocopy = (post->y_height >> (4 + 3));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  /* Note the baseline filter values for each segment */
-  /* See vp9_loop_filter_frame_init. Rather than call that for each change
-   * to default_filt_lvl, copy the relevant calculation here.
-   */
-  if (alt_flt_enabled) {
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        // Abs value
-        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-      } else {
-        // Delta Value
-        lvl_seg[i] = default_filt_lvl + vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-        lvl_seg[i] = clamp(lvl_seg[i], 0, 63);
+        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
       }
     }
+
+    // Disable filtering on the leftmost column
+    border_mask = ~(mi_col == 0);
+    filter_selectively_vert(dst->buf, dst->stride,
+                            mask_16x16_c & border_mask,
+                            mask_8x8_c & border_mask,
+                            mask_4x4_c & border_mask,
+                            mask_4x4_int[r], lfi[r]);
+    dst->buf += 8 * dst->stride;
+    xd->mode_info_context += cm->mode_info_stride * row_step;
   }
 
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+  // Now do horizontal pass
+  dst->buf = dst0;
+  xd->mode_info_context = mi0;
+  for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
 
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
+    filter_selectively_horiz(dst->buf, dst->stride,
+                             mask_16x16[r],
+                             mask_8x8[r],
+                             mask_4x4[r],
+                             mask_4x4_int_r, mi_row + r == 0, lfi[r]);
+    dst->buf += 8 * dst->stride;
+    xd->mode_info_context += cm->mode_info_stride * row_step;
+  }
+}
 
-      if (alt_flt_enabled)
-        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-      else
-        filter_level = default_filt_lvl;
+void vp9_loop_filter_frame(VP9_COMMON *cm,
+                           MACROBLOCKD *xd,
+                           int frame_filter_level,
+                           int y_only) {
+  int mi_row, mi_col;
 
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+  // Initialize the loop filter for this frame.
+  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
 
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) {
+    MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
 
-          if (!skip_lf)
-            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) {
+      int plane;
 
-          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-        } else {
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
-                                        lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                     lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
+      setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col);
+      for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) {
+        xd->mode_info_context = mi + mi_col;
+        filter_block_plane(cm, xd, plane, mi_row, mi_col);
       }
-
-      y_ptr += 16;
-      mode_info_context += 1;      /* step to next MB */
     }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context += 1;          /* Skip border mb */
   }
 }
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -16,12 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 
 #define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
 #define SIMD_WIDTH 16
 
 /* Need to align this structure so when it is declared and
@@ -36,8 +30,7 @@
                   lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
   DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
                   hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
-  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+  unsigned char lvl[MAX_MB_SEGMENTS][4][4];
   unsigned char mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
 
@@ -56,9 +49,6 @@
   void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
            int ystride, int uv_stride, struct loop_filter_info *lfi)
 
-#define prototype_simple_loopfilter(sym) \
-  void sym(uint8_t *y, int ystride, const unsigned char *blimit)
-
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/vp9_loopfilter_x86.h"
 #endif
@@ -83,8 +73,7 @@
 void vp9_loop_filter_frame(struct VP9Common *cm,
                            struct macroblockd *mbd,
                            int filter_level,
-                           int y_only,
-                           int dering);
+                           int y_only);
 
 void vp9_loop_filter_partial_frame(struct VP9Common *cm,
                                    struct macroblockd *mbd,
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -8,19 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdlib.h>
 #include "vpx_config.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (int8_t) t;
+  return (int8_t)clamp(t, -128, 127);
 }
 
-
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+// should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                  uint8_t p3, uint8_t p2,
                                  uint8_t p1, uint8_t p0,
@@ -34,11 +31,10 @@
   mask |= (abs(q2 - q1) > limit) * -1;
   mask |= (abs(q3 - q2) > limit) * -1;
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = ~mask;
-  return mask;
+  return ~mask;
 }
 
-/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
   int8_t hev = 0;
@@ -70,73 +66,59 @@
 
   *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
   *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-  filter = filter1;
 
   // outer tap adjustments
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
+  filter = ((filter1 + 1) >> 1) & ~hev;
 
   *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s,
-                                       int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
+void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
                                        int count) {
-  int hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
     ++s;
-  } while (++i < count * 8);
+  }
 }
 
-void vp9_loop_filter_vertical_edge_c(uint8_t *s,
-                                     int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh,
+void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
                                      int count) {
-  int  hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2, s - 1, s, s + 1);
-
-    s += p;
-  } while (++i < count * 8);
+    s += pitch;
+  }
 }
-static INLINE signed char flatmask4(uint8_t thresh,
-                                    uint8_t p3, uint8_t p2,
-                                    uint8_t p1, uint8_t p0,
-                                    uint8_t q0, uint8_t q1,
-                                    uint8_t q2, uint8_t q3) {
+
+static INLINE int8_t flatmask4(uint8_t thresh,
+                               uint8_t p3, uint8_t p2,
+                               uint8_t p1, uint8_t p0,
+                               uint8_t q0, uint8_t q1,
+                               uint8_t q2, uint8_t q3) {
   int8_t flat = 0;
   flat |= (abs(p1 - p0) > thresh) * -1;
   flat |= (abs(q1 - q0) > thresh) * -1;
@@ -144,8 +126,7 @@
   flat |= (abs(q0 - q2) > thresh) * -1;
   flat |= (abs(p3 - p0) > thresh) * -1;
   flat |= (abs(q3 - q0) > thresh) * -1;
-  flat = ~flat;
-  return flat;
+  return ~flat;
 }
 static INLINE signed char flatmask5(uint8_t thresh,
                                     uint8_t p4, uint8_t p3, uint8_t p2,
@@ -167,289 +148,64 @@
                             uint8_t *oq2, uint8_t *oq3) {
   // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
   if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
 
-    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) *op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) *op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    filter(mask, hev, op1,  op0, oq0, oq1);
   }
 }
 
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s,
-                                         int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
+void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh,
                                          int count) {
-  int8_t hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int8_t flat = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask4(1, s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                        s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
              s,         s + 1 * p, s + 2 * p, s + 3 * p);
-
     ++s;
-  } while (++i < count * 8);
-
+  }
 }
 
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s,
-                                       int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
+void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
                                        int count) {
-  int8_t hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int8_t flat = 0;
-  int i = 0;
+  int i;
 
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask4(1,
-                    s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3]);
-    mbfilter(mask, hev, flat,
-             s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3);
-    s += p;
-  } while (++i < count * 8);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+                              s,     s + 1, s + 2, s + 3);
+    s += pitch;
+  }
 }
 
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static INLINE int8_t simple_filter_mask(uint8_t blimit,
-                                        uint8_t p1, uint8_t p0,
-                                        uint8_t q0, uint8_t q1) {
-  return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-}
-
-static INLINE void simple_filter(int8_t mask,
-                                 uint8_t *op1, uint8_t *op0,
-                                 uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-  const int8_t p1 = (int8_t) *op1 ^ 0x80;
-  const int8_t p0 = (int8_t) *op0 ^ 0x80;
-  const int8_t q0 = (int8_t) *oq0 ^ 0x80;
-  const int8_t q1 = (int8_t) *oq1 ^ 0x80;
-
-  int8_t filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  *oq0  = signed_char_clamp(q0 - filter1) ^ 0x80;
-
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-  *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s,
-                                              int p,
-                                              const unsigned char *blimit) {
-  int8_t mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0],
-                              s[-2 * p], s[-1 * p],
-                              s[0 * p], s[1 * p]);
-    simple_filter(mask,
-                  s - 2 * p, s - 1 * p,
-                  s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s,
-                                            int p,
-                                            const unsigned char *blimit) {
-  int8_t mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                           uint8_t *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_c(uint8_t*y_ptr, uint8_t *u_ptr,
-                          uint8_t *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                           uint8_t *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                          uint8_t *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                             uint8_t *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bhs_c(uint8_t *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
-                                           y_stride, blimit);
-}
-
-void vp9_loop_filter_bv8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                             uint8_t *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
-
 static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
                                  uint8_t flat, uint8_t flat2,
                                  uint8_t *op7, uint8_t *op6, uint8_t *op5,
@@ -460,130 +216,65 @@
                                  uint8_t *oq7) {
   // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
   if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7;
-    const uint8_t p6 = *op6;
-    const uint8_t p5 = *op5;
-    const uint8_t p4 = *op4;
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-    const uint8_t q4 = *oq4;
-    const uint8_t q5 = *oq5;
-    const uint8_t q6 = *oq6;
-    const uint8_t q7 = *oq7;
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
 
-    *op6 = (p7 * 7 + p6 * 2 +
-            p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-    *op5 = (p7 * 6 + p6 + p5 * 2 +
-            p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-    *op4 = (p7 * 5 + p6 + p5 + p4 * 2 +
-            p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-    *op3 = (p7 * 4 + p6 + p5 + p4 + p3 * 2 +
-            p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-    *op2 = (p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 +
-            p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-    *op1 = (p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-            p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-    *op0 = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-            q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-    *oq0 = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-            q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
-    *oq1 = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-            q2 + q3 + q4 + q5 + q6 + q7 * 2 + 8) >> 4;
-    *oq2 = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-            q3 + q4 + q5 + q6 + q7 * 3 + 8) >> 4;
-    *oq3 = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
-            q4 + q5 + q6 + q7 * 4 + 8) >> 4;
-    *oq4 = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
-            q5 + q6 + q7 * 5 + 8) >> 4;
-    *oq5 = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
-            q6 + q7 * 6 + 8) >> 4;
-    *oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
-            q7 * 7 + 8) >> 4;
-  } else if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
 
-    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) * op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) * op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) * oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) * oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
 
-void vp9_mb_lpf_horizontal_edge_w
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  signed char flat2 = 0;
-  int i = 0;
+void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p,
+                                 const uint8_t *blimit,
+                                 const uint8_t *limit,
+                                 const uint8_t *thresh,
+                                 int count) {
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1,
+                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
 
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask4(1,
-                     s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                     s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    flat2 = flatmask5(1,
-                      s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
-                      s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
-
     wide_mbfilter(mask, hev, flat, flat2,
                   s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                   s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
@@ -591,71 +282,29 @@
                   s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
 
     ++s;
-  } while (++i < count * 8);
+  }
 }
-void vp9_mb_lpf_vertical_edge_w
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  signed char flat2 = 0;
-  int i = 0;
 
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
+void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
+                                const uint8_t *blimit,
+                                const uint8_t *limit,
+                                const uint8_t *thresh,
+                                int count) {
+  int i;
 
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask4(1,
-                     s[-4], s[-3], s[-2], s[-1],
-                     s[ 0], s[ 1], s[ 2], s[ 3]);
-    flat2 = flatmask5(1,
-                     s[-8], s[-7], s[-6], s[-5], s[-1],
-                     s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                   q0, s[4], s[5], s[6], s[7]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-                  s - 8, s - 7, s - 6, s - 5,
-                  s - 4, s - 3, s - 2, s - 1,
-                  s,     s + 1, s + 2, s + 3,
-                  s + 4, s + 5, s + 6, s + 7);
+                  s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                  s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
     s += p;
-  } while (++i < count * 8);
+  }
 }
-
-void vp9_lpf_mbv_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                   unsigned char *v_ptr, int y_stride, int uv_stride,
-                   struct loop_filter_info *lfi) {
-  vp9_mb_lpf_vertical_edge_w(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-void vp9_lpf_mbh_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mb_lpf_horizontal_edge_w(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -11,105 +11,18 @@
 
 #include "vp9/common/vp9_blockd.h"
 
-typedef enum {
-  PRED = 0,
-  DEST = 1
-} BLOCKSET;
+void vp9_setup_block_dptrs(MACROBLOCKD *mb,
+                           int subsampling_x, int subsampling_y) {
+  int i;
 
-static void setup_block(BLOCKD *b,
-                        int mv_stride,
-                        uint8_t **base,
-                        uint8_t **base2,
-                        int stride,
-                        int offset,
-                        BLOCKSET bs) {
-  if (bs == DEST) {
-    b->dst_stride = stride;
-    b->dst = offset;
-    b->base_dst = base;
-  } else {
-    b->pre_stride = stride;
-    b->pre = offset;
-    b->base_pre = base;
-    b->base_second_pre = base2;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+    mb->plane[i].subsampling_x = i ? subsampling_x : 0;
+    mb->plane[i].subsampling_y = i ? subsampling_y : 0;
   }
-}
-
-static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
-  int block;
-
-  uint8_t **y, **u, **v;
-  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
-  BLOCKD *blockd = xd->block;
-  int stride;
-
-  if (bs == DEST) {
-    y = &xd->dst.y_buffer;
-    u = &xd->dst.u_buffer;
-    v = &xd->dst.v_buffer;
-  } else {
-    y = &xd->pre.y_buffer;
-    u = &xd->pre.u_buffer;
-    v = &xd->pre.v_buffer;
-
-    y2 = &xd->second_pre.y_buffer;
-    u2 = &xd->second_pre.u_buffer;
-    v2 = &xd->second_pre.v_buffer;
-  }
-
-  stride = xd->dst.y_stride;
-  for (block = 0; block < 16; block++) { /* y blocks */
-    setup_block(&blockd[block], stride, y, y2, stride,
-                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
-  }
-
-  stride = xd->dst.uv_stride;
-  for (block = 16; block < 20; block++) { /* U and V blocks */
-    setup_block(&blockd[block], stride, u, u2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-
-    setup_block(&blockd[block + 4], stride, v, v2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-  }
-}
-
-void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
-  int r, c;
-  BLOCKD *blockd = xd->block;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
-      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
-      blockd[16 + r * 2 + c].predictor =
-        xd->predictor + 256 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
-      blockd[20 + r * 2 + c].predictor =
-        xd->predictor + 320 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 24; r++) {
-    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
-    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
-  }
-}
-
-void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-  /* handle the destination pitch features */
-  setup_macroblock(xd, DEST);
-  setup_macroblock(xd, PRED);
+#if CONFIG_ALPHA
+  // TODO(jkoleszar): Using the Y w/h for now
+  mb->plane[3].subsampling_x = 0;
+  mb->plane[3].subsampling_y = 0;
+#endif
 }
--- a/vp9/common/vp9_modecont.c
+++ b/vp9/common/vp9_modecont.c
@@ -11,12 +11,13 @@
 
 #include "vp9/common/vp9_entropy.h"
 
-const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = {
-  {1,       223,   1,    237},  // 0,0 best: Only candidate
-  {87,      166,   26,   219},  // 0,0 best: non zero candidates
-  {89,      67,    18,   125},  // 0,0 best: non zero candidates, split
-  {16,      141,   69,   226},  // strong nz candidate(s), no split
-  {35,      122,   14,   227},  // weak nz candidate(s), no split
-  {14,      122,   22,   164},  // strong nz candidate(s), split
-  {16,      70,    9,    183},  // weak nz candidate(s), split
+const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                           [VP9_INTER_MODES - 1] = {
+  {2,       173,   34},  // 0 = both zero mv
+  {7,       145,   85},  // 1 = one zero mv + one a predicted mv
+  {7,       166,   63},  // 2 = two predicted mvs
+  {7,       94,    66},  // 3 = one predicted/zero and one new mv
+  {8,       64,    46},  // 4 = two new mvs
+  {17,      81,    31},  // 5 = one intra neighbour + x
+  {25,      29,    30},  // 6 = two intra neighbours
 };
--- a/vp9/common/vp9_modecont.h
+++ b/vp9/common/vp9_modecont.h
@@ -11,6 +11,9 @@
 #ifndef VP9_COMMON_VP9_MODECONT_H_
 #define VP9_COMMON_VP9_MODECONT_H_
 
-extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4];
+#include "vp9/common/vp9_entropy.h"
+
+extern const int vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                             [VP9_INTER_MODES - 1];
 
 #endif  // VP9_COMMON_VP9_MODECONT_H_
--- a/vp9/common/vp9_modecontext.c
+++ b/vp9/common/vp9_modecontext.c
@@ -11,137 +11,118 @@
 
 #include "vp9/common/vp9_entropymode.h"
 
-const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES]
-                                              [VP9_KF_BINTRAMODES]
-                                              [VP9_KF_BINTRAMODES] = {
-  {
-    /*Above Mode :  0*/
-    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
-    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
-    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
-    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
-    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
-    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
-    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
-    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
-    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
-    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  1*/
-    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
-    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
-    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
-    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
-    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
-    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
-    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
-    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
-    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
-    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  2*/
-    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
-    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
-    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
-    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
-    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
-    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
-    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
-    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
-    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
-    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  3*/
-    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
-    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
-    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
-    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
-    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
-    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
-    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
-    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
-    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
-    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  4*/
-    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
-    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
-    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
-    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
-    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
-    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
-    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
-    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
-    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
-    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  5*/
-    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
-    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
-    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
-    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
-    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
-    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
-    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
-    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
-    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
-    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  6*/
-    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
-    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
-    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
-    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
-    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
-    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
-    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
-    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
-    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
-    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  7*/
-    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
-    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
-    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
-    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
-    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
-    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
-    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
-    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
-    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
-    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  8*/
-    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
-    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
-    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
-    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
-    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
-    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
-    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
-    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
-    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
-    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  9*/
-    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
-    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
-    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
-    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
-    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
-    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
-    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
-    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
-    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
-    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
-  },
+const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { /* above = dc */
+    { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
+    {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
+    {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
+    {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
+    {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
+    {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
+    {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
+    {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+    {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
+    {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
+  }, { /* above = v */
+    {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
+    {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
+    {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
+    {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
+    {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
+    {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
+    {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
+    {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+    {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
+    {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
+  }, { /* above = h */
+    {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
+    {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
+    {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
+    {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
+    {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
+    {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
+    {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
+    {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+    {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
+    {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
+  }, { /* above = d45 */
+    { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
+    {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
+    {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
+    {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
+    {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
+    {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
+    {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
+    {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+    {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
+    {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
+  }, { /* above = d135 */
+    {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
+    {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
+    {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
+    {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
+    {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
+    {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
+    {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
+    {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+    {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
+    {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
+  }, { /* above = d117 */
+    {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
+    {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
+    {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
+    {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
+    {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
+    {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
+    {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
+    {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+    {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
+    {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
+  }, { /* above = d153 */
+    {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
+    {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
+    {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
+    {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
+    {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
+    {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
+    {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
+    {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+    {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
+    {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
+  }, { /* above = d27 */
+    {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
+    {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
+    {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
+    {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
+    {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
+    {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
+    {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
+    {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+    {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
+    {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
+  }, { /* above = d63 */
+    {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
+    {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
+    {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
+    {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
+    {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
+    {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
+    {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
+    {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+    {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
+    {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
+  }, { /* above = tm */
+    {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
+    {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
+    {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
+    {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
+    {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
+    {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
+    {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
+    {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+    {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
+    {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
+  }
 };
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,35 +11,34 @@
 #include "vp9/common/vp9_mvref_common.h"
 
 #define MVREF_NEIGHBOURS 8
-
-static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
-    {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
+static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
+  // SB4X4
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB4X8
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X4
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X8
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // SB16X8
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // SB16X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB16X32
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // SB32X16
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB32X32
+  {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB32X64
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // SB64X32
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // SB64X64
+  {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
 };
-
-static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 1, 1, 1, 1, 1 };
-
-static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-    {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
-};
-
-static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 2, 2, 1, 1, 1 };
-
-
-
-static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-    {2, -1}, {-1, 2}, {3, -1}, {-1,-1}
-};
-
-static int sb64_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 1, 1, 1, 1, 1, 1, 1, 1 };
-
-
-
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
 
@@ -50,15 +49,21 @@
                                        xd->mb_to_bottom_edge + MV_BORDER);
 }
 
-// Gets a candidate refenence motion vector from the given mode info
+// Gets a candidate reference motion vector from the given mode info
 // structure if one exists that matches the given reference frame.
 static int get_matching_candidate(const MODE_INFO *candidate_mi,
                                   MV_REFERENCE_FRAME ref_frame,
-                                  int_mv *c_mv) {
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+                                  int_mv *c_mv, int block_idx) {
+  if (ref_frame == candidate_mi->mbmi.ref_frame[0]) {
+    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int;
+    else
+      c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+  } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) {
+    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int;
+    else
+      c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
   } else {
     return 0;
   }
@@ -66,7 +71,7 @@
   return 1;
 }
 
-// Gets candidate refenence motion vector(s) from the given mode info
+// Gets candidate reference motion vector(s) from the given mode info
 // structure if they exists and do NOT match the given reference frame.
 static void get_non_matching_candidates(const MODE_INFO *candidate_mi,
                                         MV_REFERENCE_FRAME ref_frame,
@@ -81,18 +86,18 @@
   *c2_ref_frame = INTRA_FRAME;
 
   // If first candidate not valid neither will be.
-  if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
+  if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) {
     // First candidate
-    if (candidate_mi->mbmi.ref_frame != ref_frame) {
-      *c_ref_frame = candidate_mi->mbmi.ref_frame;
+    if (candidate_mi->mbmi.ref_frame[0] != ref_frame) {
+      *c_ref_frame = candidate_mi->mbmi.ref_frame[0];
       c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
     }
 
     // Second candidate
-    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.second_ref_frame != ref_frame) &&
+    if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) &&
+        (candidate_mi->mbmi.ref_frame[1] != ref_frame) &&
         (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+      *c2_ref_frame = candidate_mi->mbmi.ref_frame[1];
       c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
     }
   }
@@ -103,10 +108,6 @@
 static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
                      MV_REFERENCE_FRAME candidate_ref_frame,
                      int_mv *candidate_mv, int *ref_sign_bias) {
-  // int frame_distances[MAX_REF_FRAMES];
-  // int last_distance = 1;
-  // int gf_distance = xd->frames_since_golden;
-  // int arf_distance = xd->frames_till_alt_ref_frame;
 
   // Sign inversion where appropriate.
   if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
@@ -113,135 +114,35 @@
     candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
     candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
   }
-
-  /*
-  // Scale based on frame distance if the reference frames not the same.
-  frame_distances[INTRA_FRAME] = 1;   // should never be used
-  frame_distances[LAST_FRAME] = 1;
-  frame_distances[GOLDEN_FRAME] =
-    (xd->frames_since_golden) ? xd->frames_si nce_golden : 1;
-  frame_distances[ALTREF_FRAME] =
-    (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-  if (frame_distances[this_ref_frame] &&
-      frame_distances[candidate_ref_frame]) {
-    candidate_mv->as_mv.row =
-      (short)(((int)(candidate_mv->as_mv.row) *
-               frame_distances[this_ref_frame]) /
-              frame_distances[candidate_ref_frame]);
-
-    candidate_mv->as_mv.col =
-      (short)(((int)(candidate_mv->as_mv.col) *
-               frame_distances[this_ref_frame]) /
-              frame_distances[candidate_ref_frame]);
-  }
-  */
 }
 
-/*
-// Adds a new candidate reference vector to the sorted list.
-// If it is a repeat the weight of the existing entry is increased
-// and the order of the list is resorted.
-// This method of add plus sort has been deprecated for now as there is a
-// further sort of the best candidates in vp9_find_best_ref_mvs() and the
-// incremental benefit of both is small. If the decision is made to remove
-// the sort in vp9_find_best_ref_mvs() for performance reasons then it may be
-// worth re-instating some sort of list reordering by weight here.
-//
-static void addmv_and_shuffle(
-  int_mv *mv_list,
-  int *mv_scores,
-  int *refmv_count,
-  int_mv candidate_mv,
-  int weight
-) {
-
-  int i;
-  int insert_point;
-  int duplicate_found = FALSE;
-
-  // Check for duplicates. If there is one increase its score.
-  // We only compare vs the current top candidates.
-  insert_point = (*refmv_count < (MAX_MV_REF_CANDIDATES - 1))
-                 ? *refmv_count : (MAX_MV_REF_CANDIDATES - 1);
-
-  i = insert_point;
-  if (*refmv_count > i)
-    i++;
-  while (i > 0) {
-    i--;
-    if (candidate_mv.as_int == mv_list[i].as_int) {
-      duplicate_found = TRUE;
-      mv_scores[i] += weight;
-      break;
-    }
-  }
-
-  // If no duplicate and the new candidate is good enough then add it.
-  if (!duplicate_found ) {
-    if (weight > mv_scores[insert_point]) {
-      mv_list[insert_point].as_int = candidate_mv.as_int;
-      mv_scores[insert_point] = weight;
-      i = insert_point;
-    }
-    (*refmv_count)++;
-  }
-
-  // Reshuffle the list so that highest scoring mvs at the top.
-  while (i > 0) {
-    if (mv_scores[i] > mv_scores[i-1]) {
-      int tmp_score = mv_scores[i-1];
-      int_mv tmp_mv = mv_list[i-1];
-
-      mv_scores[i-1] = mv_scores[i];
-      mv_list[i-1] = mv_list[i];
-      mv_scores[i] = tmp_score;
-      mv_list[i] = tmp_mv;
-      i--;
-    } else
-      break;
-  }
-}
-*/
-
-// Adds a new candidate reference vector to the list.
-// The mv is thrown out if it is already in the list.
-// Unlike the addmv_and_shuffle() this does not reorder the list
-// but assumes that candidates are added in the order most likely to
-// match distance and reference frame bias.
+// Add a candidate mv.
+// Discard if it has already been seen.
 static void add_candidate_mv(int_mv *mv_list,  int *mv_scores,
                              int *candidate_count, int_mv candidate_mv,
                              int weight) {
-  int i;
-
-  // Make sure we dont insert off the end of the list
-  const int insert_point = MIN(*candidate_count, MAX_MV_REF_CANDIDATES - 1);
-
-  // Look for duplicates
-  for (i = 0; i <= insert_point; ++i) {
-    if (candidate_mv.as_int == mv_list[i].as_int)
-      break;
+  if (*candidate_count == 0) {
+    mv_list[0].as_int = candidate_mv.as_int;
+    mv_scores[0] = weight;
+    *candidate_count += 1;
+  } else if ((*candidate_count == 1) &&
+             (candidate_mv.as_int != mv_list[0].as_int)) {
+    mv_list[1].as_int = candidate_mv.as_int;
+    mv_scores[1] = weight;
+    *candidate_count += 1;
   }
-
-  // Add the candidate. If the list is already full it is only desirable that
-  // it should overwrite if it has a higher weight than the last entry.
-  if (i >= insert_point && weight > mv_scores[insert_point]) {
-    mv_list[insert_point].as_int = candidate_mv.as_int;
-    mv_scores[insert_point] = weight;
-    *candidate_count += (*candidate_count < MAX_MV_REF_CANDIDATES);
-  }
 }
 
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
+// This function searches the neighbourhood of a given MB/SB
+// to try and find candidate reference vectors.
 //
-void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
-                      MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int *ref_sign_bias) {
+void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
+                          MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
+                          int_mv *mv_ref_list, int *ref_sign_bias,
+                          int block_idx) {
   int i;
   MODE_INFO *candidate_mi;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int_mv candidate_mvs[MAX_MV_REF_CANDIDATES];
   int_mv c_refmv;
   int_mv c2_refmv;
   MV_REFERENCE_FRAME c_ref_frame;
@@ -250,110 +151,119 @@
   int refmv_count = 0;
   int split_count = 0;
   int (*mv_ref_search)[2];
-  int *ref_distance_weight;
-  int zero_seen = FALSE;
-  const int mb_col = (-xd->mb_to_left_edge) >> 7;
+  const int mi_col = get_mi_col(xd);
+  const int mi_row = get_mi_row(xd);
+  int intra_count = 0;
+  int zero_count = 0;
+  int newmv_count = 0;
+  int x_idx = 0, y_idx = 0;
 
   // Blank the reference vector lists and other local structures.
   vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
   vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
 
-  if (mbmi->sb_type == BLOCK_SIZE_SB64X64) {
-    mv_ref_search = sb64_mv_ref_search;
-    ref_distance_weight = sb64_ref_distance_weight;
-  } else if (mbmi->sb_type == BLOCK_SIZE_SB32X32) {
-    mv_ref_search = sb_mv_ref_search;
-    ref_distance_weight = sb_ref_distance_weight;
-  } else {
-    mv_ref_search = mb_mv_ref_search;
-    ref_distance_weight = mb_ref_distance_weight;
+  mv_ref_search = mv_ref_blocks[mbmi->sb_type];
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    x_idx = block_idx & 1;
+    y_idx = block_idx >> 1;
   }
 
   // We first scan for candidate vectors that match the current reference frame
   // Look at nearest neigbours
   for (i = 0; i < 2; ++i) {
-    const int mb_search_col = mb_col + mv_ref_search[i][0];
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
+      int b;
 
-    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-        (mb_search_col < cm->cur_tile_mb_col_end) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        add_candidate_mv(candidate_mvs, candidate_scores,
-                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
+      if (block_idx >= 0) {
+        if (mv_ref_search[i][0])
+          b = 1 + y_idx * 2;
+        else
+          b = 2 + x_idx;
+      } else {
+        b = -1;
       }
-      split_count += (candidate_mi->mbmi.mode == SPLITMV);
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 16);
+      }
+      split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 &&
+                      candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME);
+
+      // Count number of neighbours coded intra and zeromv
+      intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
+      zero_count += (candidate_mi->mbmi.mode == ZEROMV);
+      newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
     }
   }
-  // Look in the last frame if it exists
-  if (lf_here) {
-    candidate_mi = lf_here;
-    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c_refmv, 18);
-    }
-  }
+
   // More distant neigbours
   for (i = 2; (i < MVREF_NEIGHBOURS) &&
-              (refmv_count < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
-    const int mb_search_col = mb_col + mv_ref_search[i][0];
-
-    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-        (mb_search_col < cm->cur_tile_mb_col_end) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        add_candidate_mv(candidate_mvs, candidate_scores,
-                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 16);
       }
     }
   }
 
+  // Look in the last frame if it exists
+  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
+    candidate_mi = lf_here;
+    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c_refmv, 16);
+    }
+  }
+
   // If we have not found enough candidates consider ones where the
   // reference frame does not match. Break out when we have
   // MAX_MV_REF_CANDIDATES candidates.
   // Look first at spatial neighbours
-  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1)) {
-    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-      const int mb_search_col = mb_col + mv_ref_search[i][0];
+  for (i = 0; (i < MVREF_NEIGHBOURS) &&
+              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-          (mb_search_col < cm->cur_tile_mb_col_end) &&
-          ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+      get_non_matching_candidates(candidate_mi, ref_frame,
+                                  &c_ref_frame, &c_refmv,
+                                  &c2_ref_frame, &c2_refmv);
 
-        candidate_mi = here + mv_ref_search[i][0] +
-                       (mv_ref_search[i][1] * xd->mode_info_stride);
-
-        get_non_matching_candidates(candidate_mi, ref_frame,
-                                    &c_ref_frame, &c_refmv,
-                                    &c2_ref_frame, &c2_refmv);
-
-        if (c_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-          add_candidate_mv(candidate_mvs, candidate_scores,
-                           &refmv_count, c_refmv, ref_distance_weight[i]);
-        }
-
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-          add_candidate_mv(candidate_mvs, candidate_scores,
-                           &refmv_count, c2_refmv, ref_distance_weight[i]);
-        }
+      if (c_ref_frame != INTRA_FRAME) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 1);
       }
 
-      if (refmv_count >= (MAX_MV_REF_CANDIDATES - 1)) {
-        break;
+      if (c2_ref_frame != INTRA_FRAME) {
+        scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c2_refmv, 1);
       }
     }
   }
+
   // Look at the last frame if it exists
-  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1) && lf_here) {
+  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
     candidate_mi = lf_here;
     get_non_matching_candidates(candidate_mi, ref_frame,
                                 &c_ref_frame, &c_refmv,
@@ -361,49 +271,36 @@
 
     if (c_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c_refmv, 2);
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c_refmv, 1);
     }
 
     if (c2_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c2_refmv, 2);
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c2_refmv, 1);
     }
   }
 
-  // Define inter mode coding context.
-  // 0,0 was best
-  if (candidate_mvs[0].as_int == 0) {
-    // 0,0 is only candidate
-    if (refmv_count <= 1) {
-      mbmi->mb_mode_context[ref_frame] = 0;
-    // non zero candidates candidates available
-    } else if (split_count == 0) {
-      mbmi->mb_mode_context[ref_frame] = 1;
+  if (!intra_count) {
+    if (!newmv_count) {
+      // 0 = both zero mv
+      // 1 = one zero mv + one predicted mv
+      // 2 = two predicted mvs
+      mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
     } else {
-      mbmi->mb_mode_context[ref_frame] = 2;
+      // 3 = one predicted/zero and one new mv
+      // 4 = two new mvs
+      mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
     }
-  } else if (split_count == 0) {
-    // Non zero best, No Split MV cases
-    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 3 : 4;
   } else {
-    // Non zero best, some split mv
-    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 5 : 6;
+    // 5 = one intra neighbour + x
+    // 6 = two intra neighbours
+    mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
   }
 
-  // Scan for 0,0 case and clamp non zero choices
+  // Clamp vectors
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    if (candidate_mvs[i].as_int == 0) {
-      zero_seen = TRUE;
-    } else {
-      clamp_mv_ref(xd, &candidate_mvs[i]);
-    }
+    clamp_mv_ref(xd, &mv_ref_list[i]);
   }
-  // 0,0 is always a valid reference. Add it if not already seen.
-  if (!zero_seen)
-    candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;
-
-  // Copy over the candidate list.
-  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
 }
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -14,12 +14,24 @@
 #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
 #define VP9_COMMON_VP9_MVREF_COMMON_H_
 
-void vp9_find_mv_refs(VP9_COMMON *cm,
-                      MACROBLOCKD *xd,
-                      MODE_INFO *here,
-                      MODE_INFO *lf_here,
-                      MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list,
-                      int *ref_sign_bias);
+void vp9_find_mv_refs_idx(VP9_COMMON *cm,
+                          MACROBLOCKD *xd,
+                          MODE_INFO *here,
+                          MODE_INFO *lf_here,
+                          MV_REFERENCE_FRAME ref_frame,
+                          int_mv *mv_ref_list,
+                          int *ref_sign_bias,
+                          int block_idx);
+
+static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
+                                    MACROBLOCKD *xd,
+                                    MODE_INFO *here,
+                                    MODE_INFO *lf_here,
+                                    MV_REFERENCE_FRAME ref_frame,
+                                    int_mv *mv_ref_list,
+                                    int *ref_sign_bias) {
+  vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
+                       mv_ref_list, ref_sign_bias, -1);
+}
 
 #endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -21,6 +21,9 @@
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
+
+#define MAX_MB_SEGMENTS 8
+
   typedef int *VP9_PTR;
 
   /* Create/destroy static data structures. */
@@ -225,8 +228,9 @@
 
   int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
                      unsigned int rows, unsigned int cols,
-                     int delta_q[4], int delta_lf[4],
-                     unsigned int threshold[4]);
+                     int delta_q[MAX_MB_SEGMENTS],
+                     int delta_lf[MAX_MB_SEGMENTS],
+                     unsigned int threshold[MAX_MB_SEGMENTS]);
 
   int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
                          unsigned int rows, unsigned int cols);
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -18,28 +18,20 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_quant_common.h"
+
 #if CONFIG_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 
-/*#ifdef PACKET_TESTING*/
-#include "vp9/common/vp9_header.h"
-/*#endif*/
-
 /* Create/destroy static data structures. */
 
-void vp9_initialize_common(void);
+// Define the number of candidate reference buffers.
+#define NUM_REF_FRAMES 8
+#define NUM_REF_FRAMES_LG2 3
 
-#define MINQ 0
+#define ALLOWED_REFS_PER_FRAME 3
 
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
-#define NUM_REF_FRAMES 3
-#define NUM_REF_FRAMES_LG2 2
-
 // 1 scratch frame for the new frame, 3 for scaled references on the encoder
 // TODO(jkoleszar): These 3 extra references could probably come from the
 // normal reference pool.
@@ -48,107 +40,72 @@
 #define NUM_FRAME_CONTEXTS_LG2 2
 #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)
 
-#define COMP_PRED_CONTEXTS   2
+#define MAX_LAG_BUFFERS 25
 
 typedef struct frame_contexts {
-  vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
+  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
+                         [PARTITION_TYPES - 1];
 
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
-
   nmv_context nmvc;
   nmv_context pre_nmvc;
-  vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
-  unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
-  unsigned int sb_ymode_counts[VP9_I32X32_MODES];
-  unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
-  unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
-  unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
-  unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
+  /* interframe intra mode probs */
+  vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
+  vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  /* interframe intra mode probs */
+  unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
+  unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES];
+  unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
-  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC4X4_NODES];
-  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC8X8_NODES];
-  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC16X16_NODES];
-  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC32X32_NODES];
-  vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                             [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
-
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
   unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
                                 [COEF_BANDS][PREV_COEF_CONTEXTS];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC4X4_TOKENS];
-  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC8X8_TOKENS];
-  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC16X16_TOKENS];
-  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC32X32_TOKENS];
-  unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS]
-                              [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2];
-#endif
-
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_counts[2];
-  vp9_prob interintra_prob;
-  vp9_prob pre_interintra_prob;
-#endif
+  vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+      [VP9_SWITCHABLE_FILTERS - 1];
+  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
+                                      [VP9_SWITCHABLE_FILTERS];
 
-  int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
-  unsigned int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
+  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
+  vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
+  unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+
+  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob pre_single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob pre_comp_ref_prob[REF_CONTEXTS];
+  unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref_count[REF_CONTEXTS][2][2];
+  unsigned int comp_ref_count[REF_CONTEXTS][2];
+
+  vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
+  unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+
+  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+  vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS];
+  unsigned int mbskip_count[MBSKIP_CONTEXTS][2];
 } FRAME_CONTEXT;
 
 typedef enum {
-  RECON_CLAMP_REQUIRED        = 0,
-  RECON_CLAMP_NOTREQUIRED     = 1
-} CLAMP_TYPE;
-
-typedef enum {
   SINGLE_PREDICTION_ONLY = 0,
   COMP_PREDICTION_ONLY   = 1,
   HYBRID_PREDICTION      = 2,
@@ -167,8 +124,11 @@
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
-  DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]);
+#endif
 
   int width;
   int height;
@@ -177,8 +137,13 @@
   int last_width;
   int last_height;
 
+  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+  // support additional planes.
+  int subsampling_x;
+  int subsampling_y;
+
   YUV_TYPE clr_type;
-  CLAMP_TYPE  clamp_type;
 
   YV12_BUFFER_CONFIG *frame_to_show;
 
@@ -186,13 +151,15 @@
   int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */
   int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */
 
-  /* TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
-   * roll new_fb_idx into it.
-   */
-  int active_ref_idx[3]; /* each frame can reference 3 buffers */
+  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+  // roll new_fb_idx into it.
+
+  // Each frame can reference ALLOWED_REFS_PER_FRAME buffers
+  int active_ref_idx[ALLOWED_REFS_PER_FRAME];
+  struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
   int new_fb_idx;
-  struct scale_factors active_ref_scale[3];
 
+
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG temp_scale_frame;
 
@@ -201,28 +168,37 @@
   FRAME_TYPE frame_type;
 
   int show_frame;
+  int last_show_frame;
 
+  // Flag signaling that the frame is encoded using only INTRA modes.
+  int intra_only;
+
+  // Flag signaling that the frame context should be reset to default values.
+  // 0 or 1 implies don't reset, 2 reset just the context specified in the
+  // frame header, 3 reset all contexts.
+  int reset_frame_context;
+
   int frame_flags;
+  // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+  // MODE_INFO (8-pixel) units.
   int MBs;
-  int mb_rows;
-  int mb_cols;
+  int mb_rows, mi_rows;
+  int mb_cols, mi_cols;
   int mode_info_stride;
 
   /* profile settings */
-  int experimental;
-  int mb_no_coeff_skip;
   TXFM_MODE txfm_mode;
-  COMPPREDMODE_TYPE comp_pred_mode;
-  int no_lpf;
-  int use_bilinear_mc_filter;
-  int full_pixel;
 
   int base_qindex;
   int last_kf_gf_q;  /* Q used on the last GF or KF */
 
-  int y1dc_delta_q;
-  int uvdc_delta_q;
-  int uvac_delta_q;
+  int y_dc_delta_q;
+  int uv_dc_delta_q;
+  int uv_ac_delta_q;
+#if CONFIG_ALPHA
+  int a_dc_delta_q;
+  int a_ac_delta_q;
+#endif
 
   unsigned int frames_since_golden;
   unsigned int frames_till_alt_ref_frame;
@@ -240,7 +216,6 @@
   unsigned char *last_frame_seg_map;
 
   INTERPOLATIONFILTERTYPE mcomp_filter_type;
-  LOOPFILTERTYPE filter_type;
 
   loop_filter_info_n lf_info;
 
@@ -247,49 +222,36 @@
   int filter_level;
   int last_sharpness_level;
   int sharpness_level;
-  int dering_enabled;
 
-  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+  int refresh_frame_context;    /* Two state 0 = NO, 1 = YES */
 
   int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
 
   /* Y,U,V */
-  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
   /* keyframe block modes are predicted by their above, left neighbors */
 
-  vp9_prob kf_bmode_prob[VP9_KF_BINTRAMODES]
-                        [VP9_KF_BINTRAMODES]
-                        [VP9_KF_BINTRAMODES - 1];
-  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-  int kf_ymode_probs_index;
-  int kf_ymode_probs_update;
-  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
+  vp9_prob kf_y_mode_prob[VP9_INTRA_MODES]
+                         [VP9_INTRA_MODES]
+                         [VP9_INTRA_MODES - 1];
+  vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1];
 
-  vp9_prob prob_intra_coded;
-  vp9_prob prob_last_coded;
-  vp9_prob prob_gf_coded;
-  vp9_prob sb32_coded;
-  vp9_prob sb64_coded;
-
   // Context probabilities when using predictive coding of segment id
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
   unsigned char temporal_update;
 
   // Context probabilities for reference frame prediction
-  unsigned char ref_scores[MAX_REF_FRAMES];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
+  int allow_comp_inter_inter;
+  MV_REFERENCE_FRAME comp_fixed_ref;
+  MV_REFERENCE_FRAME comp_var_ref[2];
+  COMPPREDMODE_TYPE comp_pred_mode;
 
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
-
-  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
-
   FRAME_CONTEXT fc;  /* this frame entropy */
   FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
   unsigned int  frame_context_idx; /* Context to use/update */
@@ -298,9 +260,6 @@
   int near_boffset[3];
   int version;
 
-#ifdef PACKET_TESTING
-  VP9_HEADER oh;
-#endif
   double bitrate;
   double framerate;
 
@@ -308,17 +267,13 @@
   struct postproc_state  postproc_state;
 #endif
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  int use_interintra;
-#endif
-
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
   int tile_columns, log2_tile_columns;
-  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx;
+  int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx;
   int tile_rows, log2_tile_rows;
-  int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx;
+  int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx;
 } VP9_COMMON;
 
 static int get_free_fb(VP9_COMMON *cm) {
@@ -341,31 +296,76 @@
   buf[new_idx]++;
 }
 
-// TODO(debargha): merge the two functions
-static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_row, int block_size) {
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+static int mi_cols_aligned_to_sb(VP9_COMMON *cm) {
+  return 2 * ((cm->mb_cols + 3) & ~3);
+}
 
-  // Are edges available for intra prediction?
-  xd->up_available    = (mb_row != 0);
+static INLINE void set_partition_seg_context(VP9_COMMON *cm,
+                                             MACROBLOCKD *xd,
+                                             int mi_row, int mi_col) {
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 }
 
-static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_col, int block_size) {
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                int mi_row, int mi_col,
+                                BLOCK_SIZE_TYPE bsize) {
+  int bsl = mi_width_log2(bsize), bs = 1 << bsl;
+  int ms = bs / 2;
 
+  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols))
+    return 0;
+
+  // frame width/height are multiples of 8, hence 8x8 block should always
+  // pass the above check
+  assert(bsize > BLOCK_SIZE_SB8X8);
+
+  // return the node index in the prob tree for binary coding
+  // skip horizontal/none partition types
+  if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows))
+    return 1;
+  // skip vertical/none partition types
+  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols))
+    return 2;
+
+  return -1;
+}
+
+static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mi_row, int bh,
+                       int mi_col, int bw) {
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) << 3);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3;
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) << 3);
+  xd->mb_to_right_edge  = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3;
+
   // Are edges available for intra prediction?
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  xd->up_available    = (mi_row != 0);
+  xd->left_available  = (mi_col > cm->cur_tile_mi_col_start);
+  xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
 }
 
-static int get_mb_row(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_top_edge) >> 7);
+static int get_mi_row(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE));
 }
 
-static int get_mb_col(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_left_edge) >> 7);
+static int get_mi_col(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE));
+}
+
+static int get_token_alloc(int mb_rows, int mb_cols) {
+  return mb_rows * mb_cols * (48 * 16 + 4);
+}
+
+static void set_prev_mi(VP9_COMMON *cm) {
+  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
+                                       cm->height == cm->last_height &&
+                                       !cm->error_resilient_mode &&
+                                       !cm->intra_only &&
+                                       cm->last_show_frame;
+  // Special case: set prev_mi to NULL when the previous mode info
+  // context cannot be used.
+  cm->prev_mi = use_prev_in_find_mv_refs ?
+                  cm->prev_mip + cm->mode_info_stride + 1 : NULL;
 }
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -53,7 +53,7 @@
   { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
 };
 
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = {
   { RGB_TO_YUV(0x6633ff) },   /* Purple */
   { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
   { RGB_TO_YUV(0xff33cc) },   /* Pink */
@@ -132,7 +132,7 @@
 
 /****************************************************************************
  */
-void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
+void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
                                      int dst_pixels_per_line,
@@ -139,7 +139,8 @@
                                      int rows,
                                      int cols,
                                      int flimit) {
-  uint8_t *p_src, *p_dst;
+  uint8_t const *p_src;
+  uint8_t *p_dst;
   int row;
   int col;
   int i;
@@ -313,51 +314,64 @@
                                 source->uv_height, source->uv_width, ppl);
 }
 
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
-                                source->y_stride, post->y_stride,
-                                source->y_height, source->y_width, ppl);
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width,
+                             src->alpha_width};
+  const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height,
+                              src->alpha_height};
 
-  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
 
-  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                  src_strides[i], dst_strides[i],
+                                  src_heights[i], src_widths[i], ppl);
 }
 
-void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) post;
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_stride, src->y_stride, src->y_height - 4,
-                                src->y_width - 4, ppl);
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width,
+                             src->alpha_width};
+  const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height,
+                              src->alpha_height};
 
-  vp9_post_proc_down_and_across(src->u_buffer + 2 * src->uv_stride + 2,
-                                src->u_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
 
-  vp9_post_proc_down_and_across(src->v_buffer + 2 * src->uv_stride + 2,
-                                src->v_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const int src_stride = src_strides[i];
+    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+    const int src_width = src_widths[i] - 4;
+    const int src_height = src_heights[i] - 4;
+
+    const int dst_stride = dst_strides[i];
+    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+    vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                  src_height, src_width, ppl);
+  }
 }
 
 double vp9_gaussian(double sigma, double mu, double x) {
@@ -631,13 +645,7 @@
 
   if (!flags) {
     *dest = *oci->frame_to_show;
-
-    /* handle problem with extending borders */
-    dest->y_width = oci->width;
-    dest->y_height = oci->height;
-    dest->uv_height = dest->y_height / 2;
     return 0;
-
   }
 
 #if ARCH_X86||ARCH_X86_64
@@ -648,7 +656,7 @@
     deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0);
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
   } else {
     vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
   }
@@ -727,7 +735,7 @@
     for (i = 0; i < mb_rows; i++) {
       for (j = 0; j < mb_cols; j++) {
         char zz[4];
-        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+        int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                         mi[mb_index].mbmi.mode != SPLITMV &&
                         mi[mb_index].mbmi.mb_skip_coeff);
 
@@ -913,8 +921,8 @@
       for (x = 0; x < width; x += 16) {
         int Y = 0, U = 0, V = 0;
 
-        if (mi->mbmi.mode == B_PRED &&
-            ((ppflags->display_mb_modes_flag & B_PRED) ||
+        if (mi->mbmi.mode == I4X4_PRED &&
+            ((ppflags->display_mb_modes_flag & I4X4_PRED) ||
              ppflags->display_b_modes_flag)) {
           int by, bx;
           uint8_t *yl, *ul, *vl;
@@ -927,7 +935,7 @@
           for (by = 0; by < 16; by += 4) {
             for (bx = 0; bx < 16; bx += 4) {
               if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
-                  || (ppflags->display_mb_modes_flag & B_PRED)) {
+                  || (ppflags->display_mb_modes_flag & I4X4_PRED)) {
                 Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
                 U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
                 V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -29,10 +29,8 @@
 int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
                         vp9_ppflags_t *flags);
 
-void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
-void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
 #endif  // VP9_COMMON_VP9_POSTPROC_H_
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -9,6 +9,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -21,8 +23,11 @@
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
   int pred_context;
-  MODE_INFO *m = xd->mode_info_context;
-
+  const MODE_INFO *const mi = xd->mode_info_context;
+  const MODE_INFO *const above_mi = mi - cm->mode_info_stride;
+  const MODE_INFO *const left_mi = mi - 1;
+  const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image;
+  const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries correpsonding to real macroblocks.
@@ -29,77 +34,352 @@
   // The prediction flags in these dummy entries are initialised to 0.
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      pred_context = above_mi->mbmi.seg_id_predicted;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.seg_id_predicted;
+        pred_context += left_mi->mbmi.seg_id_predicted;
       break;
 
-    case PRED_REF:
-      pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;
+    case PRED_MBSKIP:
+      pred_context = above_mi->mbmi.mb_skip_coeff;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.ref_predicted;
+        pred_context += left_mi->mbmi.mb_skip_coeff;
       break;
 
-    case PRED_COMP:
-      // Context based on use of comp pred flag by neighbours
-      // pred_context =
-      //   ((m - 1)->mbmi.second_ref_frame > INTRA_FRAME) +
-      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame > INTRA_FRAME);
+    case PRED_SWITCHABLE_INTERP: {
+      // left
+      const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode);
+      const int left_interp = left_in_image && left_mv_pred ?
+                    vp9_switchable_interp_map[left_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
 
-      // Context based on mode and reference frame
-      // if ( m->mbmi.ref_frame == LAST_FRAME )
-      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
-      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
-      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
-      // else
-      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
+      // above
+      const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode);
+      const int above_interp = above_in_image && above_mv_pred ?
+                    vp9_switchable_interp_map[above_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
 
-      if (m->mbmi.ref_frame == LAST_FRAME)
-        pred_context = 0;
+      assert(left_interp != -1);
+      assert(above_interp != -1);
+
+      if (left_interp == above_interp)
+        pred_context = left_interp;
+      else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+               above_interp != VP9_SWITCHABLE_FILTERS)
+         pred_context = above_interp;
+      else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+               above_interp == VP9_SWITCHABLE_FILTERS)
+        pred_context = left_interp;
       else
+        pred_context = VP9_SWITCHABLE_FILTERS;
+
+      break;
+    }
+
+    case PRED_INTRA_INTER: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            above_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/intra (3)
+          pred_context = 3;
+        } else {  // intra/inter (1) or inter/inter (0)
+          pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                         above_mi->mbmi.ref_frame[0] == INTRA_FRAME;
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        // inter: 0, intra: 2
+        pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME);
+      } else {
+        pred_context = 0;
+      }
+      assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
+      break;
+    }
+
+    case PRED_COMP_INTER_INTER: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // neither edge uses comp pred (0/1)
+          pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^
+                          (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref));
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // one of two edges uses comp pred (2/3)
+          pred_context = 2 +
+              (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
+               above_mi->mbmi.ref_frame[0] == INTRA_FRAME);
+        } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // one of two edges uses comp pred (2/3)
+          pred_context = 2 +
+              (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
+               left_mi->mbmi.ref_frame[0] == INTRA_FRAME);
+        } else {  // both edges use comp pred (4)
+          pred_context = 4;
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // edge does not use comp pred (0/1)
+          pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref;
+        } else {  // edge uses comp pred (3)
+          pred_context = 3;
+        }
+      } else {  // no edges available (1)
         pred_context = 1;
+      }
+      assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS);
+      break;
+    }
 
+    case PRED_COMP_REF_P: {
+      const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      const int var_ref_idx = !fix_ref_idx;
+
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/intra (2)
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/inter
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
+
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {  // single pred (1/3)
+            pred_context = 1 +
+                2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
+          } else {  // comp pred (1/3)
+            pred_context = 1 +
+                2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
+          }
+        } else {  // inter/inter
+          int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
+          int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
+          MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] :
+              above_mi->mbmi.ref_frame[var_ref_idx];
+          MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] :
+              left_mi->mbmi.ref_frame[var_ref_idx];
+
+          if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+            pred_context = 0;
+          } else if (l_sg && a_sg) {  // single/single
+            if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+                (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) {
+              pred_context = 4;
+            } else if (vrfa == vrfl) {
+              pred_context = 3;
+            } else {
+              pred_context = 1;
+            }
+          } else if (l_sg || a_sg) {  // single/comp
+            MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+            MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+            if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) {
+              pred_context = 1;
+            } else if (rfs == cm->comp_var_ref[1] &&
+                       vrfc != cm->comp_var_ref[1]) {
+              pred_context = 2;
+            } else {
+              pred_context = 4;
+            }
+          } else if (vrfa == vrfl) {  // comp/comp
+            pred_context = 4;
+          } else {
+            pred_context = 2;
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) {
+          pred_context =
+              4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
+        } else {
+          pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
+      }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
-    case PRED_MBSKIP:
-      pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
-      if (xd->left_available)
-        pred_context += (m - 1)->mbmi.mb_skip_coeff;
+    case PRED_SINGLE_REF_P1: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
+
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+            pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
+          } else {
+            pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
+                                edge->mbmi.ref_frame[1] == LAST_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) +
+                         2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME);
+        } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
+          pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                              above_mi->mbmi.ref_frame[1] == LAST_FRAME ||
+                              left_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                              left_mi->mbmi.ref_frame[1] == LAST_FRAME);
+        } else {
+          MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
+
+          if (rfs == LAST_FRAME) {
+            pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+          } else {
+            pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
+        } else {
+          pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
+                              edge->mbmi.ref_frame[1] == LAST_FRAME);
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
+      }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
-    case PRED_SWITCHABLE_INTERP:
-      {
-        int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image;
-        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-        int left_mode = (m - 1)->mbmi.mode;
-        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
-        int left_interp, above_interp;
-        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
-          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
-        else
-          left_interp = VP9_SWITCHABLE_FILTERS;
-        assert(left_interp != -1);
-        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
-          above_interp = vp9_switchable_interp_map[
-              (m - cm->mode_info_stride)->mbmi.interp_filter];
-        else
-          above_interp = VP9_SWITCHABLE_FILTERS;
-        assert(above_interp != -1);
+    case PRED_SINGLE_REF_P2: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
 
-        if (left_interp == above_interp)
-          pred_context = left_interp;
-        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-                 above_interp != VP9_SWITCHABLE_FILTERS)
-          pred_context = above_interp;
-        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-                 above_interp == VP9_SWITCHABLE_FILTERS)
-          pred_context = left_interp;
-        else
-          pred_context = VP9_SWITCHABLE_FILTERS;
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+            if (edge->mbmi.ref_frame[0] == LAST_FRAME) {
+              pred_context = 3;
+            } else {
+              pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+            }
+          } else {
+            pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                    edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          if (above_mi->mbmi.ref_frame[0] == LAST_FRAME &&
+              left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
+            pred_context = 3;
+          } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                     left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
+            const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ?
+                                    left_mi : above_mi;
+
+            pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+          } else {
+            pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) +
+                           2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
+          if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] &&
+              above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) {
+            pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME ||
+                                left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME);
+          } else {
+            pred_context = 2;
+          }
+        } else {
+          MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
+
+          if (rfs == GOLDEN_FRAME) {
+            pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+          } else if (rfs == ALTREF_FRAME) {
+            pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+          } else {
+            pred_context =
+                1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME ||
+            (edge->mbmi.ref_frame[0] == LAST_FRAME &&
+             edge->mbmi.ref_frame[1] <= INTRA_FRAME)) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+        } else {
+          pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                              edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
       }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
+    case PRED_TX_SIZE: {
+      int above_context, left_context;
+      int max_tx_size;
+      if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+        max_tx_size = TX_4X4;
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+        max_tx_size = TX_8X8;
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
+        max_tx_size = TX_16X16;
+      else
+        max_tx_size = TX_32X32;
+      above_context = left_context = max_tx_size;
+      if (above_in_image) {
+        above_context = (above_mi->mbmi.mb_skip_coeff ?
+                         max_tx_size : above_mi->mbmi.txfm_size);
+      }
+      if (left_in_image) {
+        left_context = (left_mi->mbmi.mb_skip_coeff ?
+                        max_tx_size : left_mi->mbmi.txfm_size);
+      }
+      if (!left_in_image) {
+        left_context = above_context;
+      }
+      if (!above_in_image) {
+        above_context = left_context;
+      }
+      pred_context = (above_context + left_context > max_tx_size);
+      break;
+    }
+
     default:
+      assert(0);
       pred_context = 0;  // *** add error trap code.
       break;
   }
@@ -117,16 +397,20 @@
   switch (pred_id) {
     case PRED_SEG_ID:
       return cm->segment_pred_probs[pred_context];
-    case PRED_REF:
-      return cm->ref_pred_probs[pred_context];
-    case PRED_COMP:
-      // In keeping with convention elsewhre the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      return cm->prob_comppred[pred_context];
     case PRED_MBSKIP:
-      return cm->mbskip_pred_probs[pred_context];
+      return cm->fc.mbskip_probs[pred_context];
+    case PRED_INTRA_INTER:
+      return cm->fc.intra_inter_prob[pred_context];
+    case PRED_COMP_INTER_INTER:
+      return cm->fc.comp_inter_prob[pred_context];
+    case PRED_COMP_REF_P:
+      return cm->fc.comp_ref_prob[pred_context];
+    case PRED_SINGLE_REF_P1:
+      return cm->fc.single_ref_prob[pred_context][0];
+    case PRED_SINGLE_REF_P2:
+      return cm->fc.single_ref_prob[pred_context][1];
     default:
+      assert(0);
       return 128;  // *** add error trap code.
   }
 }
@@ -136,23 +420,23 @@
 const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
+  const MODE_INFO *const mi = xd->mode_info_context;
   const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
 
   switch (pred_id) {
-    case PRED_SEG_ID:
-      return &cm->segment_pred_probs[pred_context];
-    case PRED_REF:
-      return &cm->ref_pred_probs[pred_context];
-    case PRED_COMP:
-      // In keeping with convention elsewhre the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      return &cm->prob_comppred[pred_context];
-    case PRED_MBSKIP:
-      return &cm->mbskip_pred_probs[pred_context];
     case PRED_SWITCHABLE_INTERP:
       return &cm->fc.switchable_interp_prob[pred_context][0];
+
+    case PRED_TX_SIZE:
+      if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+        return cm->fc.tx_probs_8x8p[pred_context];
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
+        return cm->fc.tx_probs_16x16p[pred_context];
+      else
+        return cm->fc.tx_probs_32x32p[pred_context];
+
     default:
+      assert(0);
       return NULL;  // *** add error trap code.
   }
 }
@@ -164,11 +448,10 @@
   switch (pred_id) {
     case PRED_SEG_ID:
       return xd->mode_info_context->mbmi.seg_id_predicted;
-    case PRED_REF:
-      return  xd->mode_info_context->mbmi.ref_predicted;
     case PRED_MBSKIP:
       return xd->mode_info_context->mbmi.mb_skip_coeff;
     default:
+      assert(0);
       return 0;  // *** add error trap code.
   }
 }
@@ -179,59 +462,34 @@
                        PRED_ID pred_id,
                        unsigned char pred_flag) {
   const int mis = xd->mode_info_stride;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const int bh = 1 << mi_height_log2(bsize);
+  const int bw = 1 << mi_width_log2(bsize);
+#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
+  const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE));
+  const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE));
+#undef sub
+  int x, y;
 
   switch (pred_id) {
     case PRED_SEG_ID:
-      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted =
-                pred_flag;
-          }
+      for (y = 0; y < y_mis; y++) {
+        for (x = 0; x < x_mis; x++) {
+          xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag;
         }
       }
       break;
 
-    case PRED_REF:
-      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.ref_predicted = pred_flag;
-          }
-        }
-      }
-      break;
-
     case PRED_MBSKIP:
-      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
-          }
+      for (y = 0; y < y_mis; y++) {
+        for (x = 0; x < x_mis; x++) {
+          xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
         }
       }
       break;
 
     default:
+      assert(0);
       // *** add error trap code.
       break;
   }
@@ -242,162 +500,21 @@
 // peredict various bitstream signals.
 
 // Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd, int MbIndex) {
-  // Currently the prediction for the macroblock segment ID is
-  // the value stored for this macroblock in the previous frame.
-  if (!xd->mode_info_context->mbmi.sb_type) {
-    return cm->last_frame_seg_map[MbIndex];
-  } else {
-    const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-    const int mb_col = MbIndex % cm->mb_cols;
-    const int mb_row = MbIndex / cm->mb_cols;
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    int x, y;
-    unsigned seg_id = -1;
+int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
+                          int mi_row, int mi_col) {
+  const int mi_index = mi_row * cm->mi_cols + mi_col;
+  const int bw = 1 << mi_width_log2(sb_type);
+  const int bh = 1 << mi_height_log2(sb_type);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  int segment_id = INT_MAX;
+  int x, y;
 
-    for (y = mb_row; y < mb_row + y_mbs; y++) {
-      for (x = mb_col; x < mb_col + x_mbs; x++) {
-        seg_id = MIN(seg_id, cm->last_frame_seg_map[cm->mb_cols * y + x]);
-      }
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      const int index = mi_index + (y * cm->mi_cols + x);
+      segment_id = MIN(segment_id, cm->last_frame_seg_map[index]);
     }
-
-    return seg_id;
   }
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd) {
-  MODE_INFO *m = xd->mode_info_context;
-
-  MV_REFERENCE_FRAME left;
-  MV_REFERENCE_FRAME above;
-  MV_REFERENCE_FRAME above_left;
-  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int i;
-
-  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
-  unsigned char ref_score[MAX_REF_FRAMES];
-  unsigned char best_score = 0;
-  unsigned char left_in_image;
-  unsigned char above_in_image;
-  unsigned char above_left_in_image;
-
-  // Is segment coding ennabled
-  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // Special case treatment if segment coding is enabled.
-  // Dont allow prediction of a reference frame that the segment
-  // does not allow
-  if (seg_ref_active) {
-    for (i = 0; i < MAX_REF_FRAMES; i++) {
-      frame_allowed[i] =
-        vp9_check_segref(xd, segment_id, i);
-
-      // Score set to 0 if ref frame not allowed
-      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
-    }
-  } else
-    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
-  // Reference frames used by neighbours
-  left = (m - 1)->mbmi.ref_frame;
-  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
-  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
-  // Are neighbours in image
-  left_in_image = (m - 1)->mbmi.mb_in_image && xd->left_available;
-  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image &&
-                        xd->left_available;
-
-  // Adjust scores for candidate reference frames based on neigbours
-  if (frame_allowed[left] && left_in_image) {
-    ref_score[left] += 16;
-    if (above_left_in_image && (left == above_left))
-      ref_score[left] += 4;
-  }
-  if (frame_allowed[above] && above_in_image) {
-    ref_score[above] += 16;
-    if (above_left_in_image && (above == above_left))
-      ref_score[above] += 4;
-  }
-
-  // Now choose the candidate with the highest score
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (ref_score[i] > best_score) {
-      pred_ref = i;
-      best_score = ref_score[i];
-    }
-  }
-
-  return pred_ref;
-}
-
-// Functions to computes a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
-  int tot_count = count[0] + count[1] + count[2] + count[3];
-  probs[0] = get_prob(count[0], tot_count);
-
-  tot_count -= count[0];
-  probs[1] = get_prob(count[1], tot_count);
-
-  tot_count -= count[1];
-  probs[2] = get_prob(count[2], tot_count);
-}
-
-// Computes a set of modified conditional probabilities for the reference frame
-// Values willbe set to 0 for reference frame options that are not possible
-// because wither they were predicted and prediction has failed or because
-// they are not allowed for a given segment.
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int intra_count = cm->prob_intra_coded;
-  const int inter_count = (255 - intra_count);
-  const int last_count = (inter_count * cm->prob_last_coded) / 255;
-  const int gfarf_count = inter_count - last_count;
-  const int gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
-  const int arf_count = gfarf_count - gf_count;
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  norm_cnt[0] = 0;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
-  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = 0;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
-  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = 0;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
-  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = 0;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
-  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
-
-  // Score the reference frames based on overal frequency.
-  // These scores contribute to the prediction choices.
-  // Max score 17 min 1
-  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
-  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
-  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
-  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+  return segment_id;
 }
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -17,10 +17,14 @@
 // Predicted items
 typedef enum {
   PRED_SEG_ID = 0,  // Segment identifier
-  PRED_REF = 1,
-  PRED_COMP = 2,
-  PRED_MBSKIP = 3,
-  PRED_SWITCHABLE_INTERP = 4
+  PRED_MBSKIP = 1,
+  PRED_SWITCHABLE_INTERP = 2,
+  PRED_INTRA_INTER = 3,
+  PRED_COMP_INTER_INTER = 4,
+  PRED_SINGLE_REF_P1 = 5,
+  PRED_SINGLE_REF_P2 = 6,
+  PRED_COMP_REF_P = 7,
+  PRED_TX_SIZE = 8
 } PRED_ID;
 
 unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
@@ -43,13 +47,7 @@
                        unsigned char pred_flag);
 
 
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd,
-                                    int MbIndex);
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd);
-
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
+                          int mi_row, int mi_col);
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -10,46 +10,60 @@
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
 
-static int dc_qlookup[QINDEX_RANGE];
-static int ac_qlookup[QINDEX_RANGE];
+static int16_t dc_qlookup[QINDEX_RANGE];
+static int16_t ac_qlookup[QINDEX_RANGE];
 
-#define ACDC_MIN 4
+#define ACDC_MIN 8
 
+// TODO(dkovalev) move to common and reuse
+static double poly3(double a, double b, double c, double d, double x) {
+  return a*x*x*x + b*x*x + c*x + d;
+}
+
 void vp9_init_quant_tables() {
-  int i;
-  int current_val = 4;
-  int last_val = 4;
-  int ac_val;
+  int i, val = 4;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    ac_qlookup[i] = current_val;
-    current_val = (int)(current_val * 1.02);
-    if (current_val == last_val)
-      current_val++;
-    last_val = current_val;
+  // A "real" q of 1.0 forces lossless mode.
+  // In practice non lossless Q's between 1.0 and 2.0 (represented here by
+  // integer values from 5-7 give poor rd results (lower psnr and often
+  // larger size than the lossless encode. To block out those "not very useful"
+  // values we increment the ac and dc q lookup values by 4 after position 0.
+  ac_qlookup[0] = val;
+  dc_qlookup[0] = val;
+  val += 4;
 
-    ac_val = ac_qlookup[i];
-    dc_qlookup[i] = (int)((0.000000305 * ac_val * ac_val * ac_val) +
-                          (-0.00065 * ac_val * ac_val) +
-                          (0.9 * ac_val) + 0.5);
-    if (dc_qlookup[i] < ACDC_MIN)
-      dc_qlookup[i] = ACDC_MIN;
+  for (i = 1; i < QINDEX_RANGE; i++) {
+    const int ac_val = val;
+
+    val = (int)(val * 1.01975);
+    if (val == ac_val)
+      ++val;
+
+    ac_qlookup[i] = (int16_t)ac_val;
+    dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
+                                                 0.5, ac_val));
   }
 }
 
-int vp9_dc_quant(int qindex, int delta) {
+int16_t vp9_dc_quant(int qindex, int delta) {
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
 
-int vp9_dc_uv_quant(int qindex, int delta) {
-  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+int16_t vp9_ac_quant(int qindex, int delta) {
+  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
 
-int vp9_ac_yquant(int qindex) {
-  return ac_qlookup[clamp(qindex, 0, MAXQ)];
-}
 
-int vp9_ac_uv_quant(int qindex, int delta) {
-  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+    return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ?
+               data :  // Abs value
+               clamp(base_qindex + data, 0, MAXQ);  // Delta value
+  } else {
+    return base_qindex;
+  }
 }
+
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -12,14 +12,17 @@
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+
 void vp9_init_quant_tables();
-int vp9_ac_yquant(int qindex);
-int vp9_dc_quant(int qindex, int delta);
-int vp9_dc2quant(int qindex, int delta);
-int vp9_ac2quant(int qindex, int delta);
-int vp9_dc_uv_quant(int qindex, int delta);
-int vp9_ac_uv_quant(int qindex, int delta);
+
+int16_t vp9_dc_quant(int qindex, int delta);
+int16_t vp9_ac_quant(int qindex, int delta);
+
+int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
 
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
--- a/vp9/common/vp9_recon.c
+++ /dev/null
@@ -1,202 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-
-void vp9_recon_b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon_uv_b_c(uint8_t *pred_ptr,
-                      int16_t *diff_ptr,
-                      uint8_t *dst_ptr,
-                      int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-void vp9_recon4b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 16; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon2b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 8; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y;
-  BLOCKD *b = &xd->block[0];
-  int stride = b->dst_stride;
-  int16_t *diff = b->diff;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 16;
-  }
-}
-
-void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, i;
-  uint8_t *dst = udst;
-
-  for (i = 0; i < 2; i++, dst = vdst) {
-    BLOCKD *b = &xd->block[16 + 4 * i];
-    int stride = b->dst_stride;
-    int16_t *diff = b->diff;
-
-    for (y = 0; y < 8; y++) {
-      for (x = 0; x < 8; x++) {
-        dst[x] = clip_pixel(dst[x] + diff[x]);
-      }
-      dst += stride;
-      diff += 8;
-    }
-  }
-}
-
-void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y, stride = xd->block[0].dst_stride;
-  int16_t *diff = xd->diff;
-
-  for (y = 0; y < 32; y++) {
-    for (x = 0; x < 32; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 32;
-  }
-}
-
-void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, stride = xd->block[16].dst_stride;
-  int16_t *udiff = xd->diff + 1024;
-  int16_t *vdiff = xd->diff + 1280;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      udst[x] = clip_pixel(udst[x] + udiff[x]);
-      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
-    }
-    udst += stride;
-    vdst += stride;
-    udiff += 16;
-    vdiff += 16;
-  }
-}
-
-void vp9_recon_sb64y_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y, stride = xd->block[0].dst_stride;
-  int16_t *diff = xd->diff;
-
-  for (y = 0; y < 64; y++) {
-    for (x = 0; x < 64; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 64;
-  }
-}
-
-void vp9_recon_sb64uv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, stride = xd->block[16].dst_stride;
-  int16_t *udiff = xd->diff + 4096;
-  int16_t *vdiff = xd->diff + 4096 + 1024;
-
-  for (y = 0; y < 32; y++) {
-    for (x = 0; x < 32; x++) {
-      udst[x] = clip_pixel(udst[x] + udiff[x]);
-      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
-    }
-    udst += stride;
-    vdst += stride;
-    udiff += 32;
-    vdiff += 32;
-  }
-}
-
-void vp9_recon_mby_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_recon_mb_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -17,22 +17,110 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+static int scale_value_x_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
+}
+
+static int scale_value_y_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
+}
+
+static int unscaled_value(int val, const struct scale_factors *scale) {
+  (void) scale;
+  return val;
+}
+
+static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
+                                         const struct scale_factors *scale) {
+  // returns mv * scale + offset
+  int_mv32 result;
+  const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
+  const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
+
+  result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
+                      + scale->y_offset_q4;
+  result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
+                      + scale->x_offset_q4;
+  return result;
+}
+
+static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
+                                            const struct scale_factors *scale) {
+  // returns mv * scale + offset
+  int_mv32 result;
+
+  result.as_mv.row = src_mv->as_mv.row << 1;
+  result.as_mv.col = src_mv->as_mv.col << 1;
+  return result;
+}
+
+static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp,
+                                            int offset_q4) {
+  int32_t scaled_mv;
+  // returns the scaled and offset value of the mv component.
+  scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4;
+
+  return scaled_mv;
+}
+
+static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp,
+                                               int offset_q4) {
+  // returns the scaled and offset value of the mv component.
+  (void)scale_fp;
+  (void)offset_q4;
+  return mv_q4;
+}
+
+static void set_offsets_with_scaling(struct scale_factors *scale,
+                                     int row, int col) {
+  const int x_q4 = 16 * col;
+  const int y_q4 = 16 * row;
+
+  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
+  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
+}
+
+static void set_offsets_without_scaling(struct scale_factors *scale,
+                                        int row, int col) {
+  scale->x_offset_q4 = 0;
+  scale->y_offset_q4 = 0;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Calculate scaling factor once for each reference frame
+  // and use fixed point scaling factors in decoding and encoding routines.
+  // Hardware implementations can calculate scale factor in device driver
+  // and use multiplication and shifting on hardware instead of division.
+  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
+}
+
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h) {
-  int other_h = other->y_crop_height;
-  int other_w = other->y_crop_width;
-
-  scale->x_num = other_w;
-  scale->x_den = this_w;
+  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
   scale->x_offset_q4 = 0;  // calculated per-mb
-  scale->x_step_q4 = 16 * other_w / this_w;
+  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
 
-  scale->y_num = other_h;
-  scale->y_den = this_h;
+  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
   scale->y_offset_q4 = 0;  // calculated per-mb
-  scale->y_step_q4 = 16 * other_h / this_h;
+  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
 
+  if ((other_w == this_w) && (other_h == this_h)) {
+    scale->scale_value_x = unscaled_value;
+    scale->scale_value_y = unscaled_value;
+    scale->set_scaled_offsets = set_offsets_without_scaling;
+    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
+    scale->scale_mv_component_q4 = mv_component_q4_without_scaling;
+  } else {
+    scale->scale_value_x = scale_value_x_with_scaling;
+    scale->scale_value_y = scale_value_y_with_scaling;
+    scale->set_scaled_offsets = set_offsets_with_scaling;
+    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
+    scale->scale_mv_component_q4 = mv_component_q4_with_scaling;
+  }
+
   // TODO(agrange): Investigate the best choice of functions to use here
   // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
   // to do at full-pel offsets. The current selection, where the filter is
@@ -39,131 +127,10 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
   if (scale->x_step_q4 == 16) {
     if (scale->y_step_q4 == 16) {
       // No scaling in either direction.
       scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_1by8;
-      scale->predict[0][0][2] = vp9_convolve_qtr;
-      scale->predict[0][0][3] = vp9_convolve_3by8;
-      scale->predict[0][0][4] = vp9_convolve_avg;
-      scale->predict[0][0][5] = vp9_convolve_5by8;
-      scale->predict[0][0][6] = vp9_convolve_3qtr;
-      scale->predict[0][0][7] = vp9_convolve_7by8;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][0][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[0][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_1by8;
-      scale->predict[0][0][2] = vp9_convolve8_qtr;
-      scale->predict[0][0][3] = vp9_convolve8_3by8;
-      scale->predict[0][0][4] = vp9_convolve8_avg;
-      scale->predict[0][0][5] = vp9_convolve8_5by8;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr;
-      scale->predict[0][0][7] = vp9_convolve8_7by8;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_1by8;
-  scale->predict[1][1][2] = vp9_convolve8_qtr;
-  scale->predict[1][1][3] = vp9_convolve8_3by8;
-  scale->predict[1][1][4] = vp9_convolve8_avg;
-  scale->predict[1][1][5] = vp9_convolve8_5by8;
-  scale->predict[1][1][6] = vp9_convolve8_3qtr;
-  scale->predict[1][1][7] = vp9_convolve8_7by8;
-}
-#else
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
       scale->predict[0][0][1] = vp9_convolve_avg;
       scale->predict[0][1][0] = vp9_convolve8_vert;
       scale->predict[0][1][1] = vp9_convolve8_avg_vert;
@@ -201,35 +168,19 @@
   scale->predict[1][1][0] = vp9_convolve8;
   scale->predict[1][1][1] = vp9_convolve8_avg;
 }
-#endif
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
-  int i;
-
-  /* Calculate scaling factors for each of the 3 available references */
-  for (i = 0; i < 3; ++i) {
-    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
-      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
-      continue;
-    }
-
-    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
-                                      &cm->yv12_fb[cm->active_ref_idx[i]],
-                                      cm->width, cm->height);
-  }
-
   if (xd->mode_info_context) {
     MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
 
     set_scale_factors(xd,
-                      mbmi->ref_frame - 1,
-                      mbmi->second_ref_frame - 1,
+                      mbmi->ref_frame[0] - 1,
+                      mbmi->ref_frame[1] - 1,
                       cm->active_ref_scale);
   }
 
-
   switch (mcomp_filter_type) {
     case EIGHTTAP:
     case SWITCHABLE:
@@ -244,11 +195,6 @@
     case BILINEAR:
       xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
       break;
-#if CONFIG_ENABLE_6TAP
-    case SIXTAP:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
-      break;
-#endif
   }
   assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
@@ -340,53 +286,6 @@
   }
 }
 
-static void set_scaled_offsets(struct scale_factors *scale,
-                               int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
-
-  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
-}
-
-static int32_t scale_motion_vector_component_q3(int mv_q3,
-                                                int num,
-                                                int den,
-                                                int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-  const int32_t mv_q4 = mv_q3 << 1;
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  return mv_q4 * num / den + offset_q4;
-}
-
-static int32_t scale_motion_vector_component_q4(int mv_q4,
-                                                int num,
-                                                int den,
-                                                int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  return mv_q4 * num / den + offset_q4;
-}
-
-static int_mv32 scale_motion_vector_q3_to_q4(
-    const int_mv *src_mv,
-    const struct scale_factors *scale) {
-  // returns mv * scale + offset
-  int_mv32 result;
-
-  result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row,
-                                                      scale->y_num,
-                                                      scale->y_den,
-                                                      scale->y_offset_q4);
-  result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col,
-                                                      scale->x_num,
-                                                      scale->x_den,
-                                                      scale->x_offset_q4);
-  return result;
-}
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int_mv *mv_q3,
@@ -393,7 +292,7 @@
                                const struct scale_factors *scale,
                                int w, int h, int weight,
                                const struct subpix_fn_table *subpix) {
-  int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
+  int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale);
   src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
   scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight](
       src, src_stride, dst, dst_stride,
@@ -402,26 +301,18 @@
       w, h);
 }
 
-/* Like vp9_build_inter_predictor, but takes the full-pel part of the
- * mv separately, and the fractional part as a q4.
- */
 void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
-                                  const int_mv *fullpel_mv_q3,
-                                  const int_mv *frac_mv_q4,
+                                  const int_mv *mv_q4,
                                   const struct scale_factors *scale,
                                   int w, int h, int weight,
                                   const struct subpix_fn_table *subpix) {
-  const int mv_row_q4 = ((fullpel_mv_q3->as_mv.row >> 3) << 4)
-                        + (frac_mv_q4->as_mv.row & 0xf);
-  const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) << 4)
-                        + (frac_mv_q4->as_mv.col & 0xf);
-  const int scaled_mv_row_q4 =
-      scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den,
-                                       scale->y_offset_q4);
-  const int scaled_mv_col_q4 =
-      scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den,
-                                       scale->x_offset_q4);
+  const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row,
+                                                            scale->y_scale_fp,
+                                                            scale->y_offset_q4);
+  const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col,
+                                                            scale->x_scale_fp,
+                                                            scale->x_offset_q4);
   const int subpel_x = scaled_mv_col_q4 & 15;
   const int subpel_y = scaled_mv_row_q4 & 15;
 
@@ -433,1367 +324,205 @@
       w, h);
 }
 
-static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1,
-                                         struct scale_factors *scale,
-                                         uint8_t *predictor,
-                                         int block_size, int stride,
-                                         int which_mv, int weight,
-                                         int width, int height,
-                                         const struct subpix_fn_table *subpix,
-                                         int row, int col) {
-  assert(d1->predictor - d0->predictor == block_size);
-  assert(d1->pre == d0->pre + block_size);
+static INLINE int round_mv_comp_q4(int value) {
+  return (value < 0 ? value - 2 : value + 2) / 4;
+}
 
-  set_scaled_offsets(&scale[which_mv], row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width, height,
-                              weight, subpix);
-
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width > block_size ? block_size : width, height,
-                              weight, subpix);
-
-    if (width <= block_size) return;
-
-    set_scaled_offsets(&scale[which_mv], row, col + block_size);
-
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              predictor + block_size, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width - block_size, height,
-                              weight, subpix);
-  }
+static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row;
+  return round_mv_comp_q4(temp);
 }
 
-static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,
-                                      struct scale_factors *scale,
-                                      int block_size, int stride,
-                                      int which_mv, int weight,
-                                      const struct subpix_fn_table *subpix,
-                                      int row, int col) {
-  assert(d1->predictor - d0->predictor == block_size);
-  assert(d1->pre == d0->pre + block_size);
-
-  set_scaled_offsets(&scale[which_mv], row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              d0->predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              2 * block_size, block_size,
-                              weight, subpix);
-
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              d0->predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              block_size, block_size,
-                              weight, subpix);
-
-    set_scaled_offsets(&scale[which_mv], row, col + block_size);
-
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              d1->predictor, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              block_size, block_size,
-                              weight, subpix);
-  }
+static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col;
+  return round_mv_comp_q4(temp);
 }
 
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+// TODO(jkoleszar): yet another mv clamping function :-(
+MV clamp_mv_to_umv_border_sb(const MV *src_mv,
+    int bwl, int bhl, int ss_x, int ss_y,
+    int mb_to_left_edge, int mb_to_top_edge,
+    int mb_to_right_edge, int mb_to_bottom_edge) {
   /* If the MV points so far into the UMV border that no visible pixels
    * are used for reconstruction, the subpel part of the MV can be
    * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
    */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
+  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
+  const int spel_right = spel_left - (1 << 4);
+  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
+  const int spel_bottom = spel_top - (1 << 4);
+  MV clamped_mv;
 
-  if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+  assert(ss_x <= 1);
+  assert(ss_y <= 1);
+  clamped_mv.col = clamp(src_mv->col << (1 - ss_x),
+                         (mb_to_left_edge << (1 - ss_x)) - spel_left,
+                         (mb_to_right_edge << (1 - ss_x)) + spel_right);
+  clamped_mv.row = clamp(src_mv->row << (1 - ss_y),
+                         (mb_to_top_edge << (1 - ss_y)) - spel_top,
+                         (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+  return clamped_mv;
 }
 
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  const int extend = VP9_INTERP_EXTEND;
+struct build_inter_predictors_args {
+  MACROBLOCKD *xd;
+  int x;
+  int y;
+  uint8_t* dst[MAX_MB_PLANE];
+  int dst_stride[MAX_MB_PLANE];
+  uint8_t* pre[2][MAX_MB_PLANE];
+  int pre_stride[2][MAX_MB_PLANE];
+};
+static void build_inter_predictors(int plane, int block,
+                                   BLOCK_SIZE_TYPE bsize,
+                                   int pred_w, int pred_h,
+                                   void *argv) {
+  const struct build_inter_predictors_args* const arg = argv;
+  MACROBLOCKD * const xd = arg->xd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bh = 4 << bhl, bw = 4 << bwl;
+  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
+  const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
+  int which_mv;
 
-  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
-  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+  assert(x < bw);
+  assert(y < bh);
+  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
+         4 << pred_w == bw);
+  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
+         4 << pred_h == bh);
 
-  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
-  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
-}
+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+    // source
+    const uint8_t * const base_pre = arg->pre[which_mv][plane];
+    const int pre_stride = arg->pre_stride[which_mv][plane];
+    const uint8_t *const pre = base_pre +
+        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
+    struct scale_factors * const scale =
+      plane == 0 ? &xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv];
 
-#define AVERAGE_WEIGHT  (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT))
+    // dest
+    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
+    // motion vector
+    const MV *mv;
+    MV split_chroma_mv;
+    int_mv clamped_mv;
 
-// Whether to use implicit weighting for UV
-#define USE_IMPLICIT_WEIGHT_UV
-
-// Whether to use implicit weighting for SplitMV
-// #define USE_IMPLICIT_WEIGHT_SPLITMV
-
-// #define SEARCH_MIN3
-static int64_t get_consistency_metric(MACROBLOCKD *xd,
-                                      uint8_t *tmp_y, int tmp_ystride) {
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-  uint8_t *rec_y = xd->dst.y_buffer;
-  int rec_ystride = xd->dst.y_stride;
-  int64_t metric = 0;
-  int i;
-  if (xd->up_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - rec_ystride + i) -
-                     *(tmp_y + i));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->left_available ?
-          abs(*(rec_y - rec_ystride + i - 1) - *(tmp_y + i)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - rec_ystride + i + 1) - *(tmp_y + i)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  if (xd->left_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - 1 + i * rec_ystride) -
-                     *(tmp_y + i * tmp_ystride));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->up_available ?
-          abs(*(rec_y - 1 + (i - 1) * rec_ystride) -
-                      *(tmp_y + i * tmp_ystride)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - 1 + (i + 1) * rec_ystride) -
-              *(tmp_y + i * tmp_ystride)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  return metric;
-}
-
-static int get_weight(MACROBLOCKD *xd, int64_t metric_1, int64_t metric_2) {
-  int weight = AVERAGE_WEIGHT;
-  if (2 * metric_1 < metric_2)
-    weight = 6;
-  else if (4 * metric_1 < 3 * metric_2)
-    weight = 5;
-  else if (2 * metric_2 < metric_1)
-    weight = 2;
-  else if (4 * metric_2 < 3 * metric_1)
-    weight = 3;
-  return weight;
-}
-
-#ifdef USE_IMPLICIT_WEIGHT_SPLITMV
-static int get_implicit_compoundinter_weight_splitmv(
-    MACROBLOCKD *xd, int mb_row, int mb_col) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int i, which_mv, weight;
-  uint8_t tmp_y[256];
-  const int tmp_ystride = 16;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  assert(xd->mode_info_context->mbmi.mode == SPLITMV);
-
-  which_mv = 1;  // second predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
+    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      if (plane == 0) {
+        mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
       } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
+        // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+        // same MV (the average of the 4 luma MVs) but we could do something
+        // smarter for non-4:2:0. Just punt for now, pending the changes to get
+        // rid of SPLITMV mode entirely.
+        split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
+        split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
+        mv = &split_chroma_mv;
       }
+    } else {
+      mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
     }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
 
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+    /* TODO(jkoleszar): This clamping is done in the incorrect place for the
+     * scaling case. It needs to be done on the scaled MV, not the pre-scaling
+     * MV. Note however that it performs the subsampling aware scaling so
+     * that the result is always q4.
+     */
+    clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
+                                                 xd->plane[plane].subsampling_x,
+                                                 xd->plane[plane].subsampling_y,
+                                                 xd->mb_to_left_edge,
+                                                 xd->mb_to_top_edge,
+                                                 xd->mb_to_right_edge,
+                                                 xd->mb_to_bottom_edge);
+    scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
 
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
+    vp9_build_inter_predictor_q4(pre, pre_stride,
+                                 dst, arg->dst_stride[plane],
+                                 &clamped_mv, &xd->scale_factor[which_mv],
+                                 4 << pred_w, 4 << pred_h, which_mv,
+                                 &xd->subpix);
   }
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  which_mv = 0;  // first predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
 }
-#endif
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
+                                    int mi_row,
+                                    int mi_col,
+                                    BLOCK_SIZE_TYPE bsize) {
+  struct build_inter_predictors_args args = {
+    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
+    {{xd->plane[0].pre[0].buf, NULL, NULL},
+     {xd->plane[0].pre[1].buf, NULL, NULL}},
+    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
+  };
 
-static int get_implicit_compoundinter_weight(MACROBLOCKD *xd,
-                                             int mb_row,
-                                             int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int n, clamp_mvs, pre_stride;
-  uint8_t *base_pre;
-  int_mv ymv;
-  uint8_t tmp_y[4096];
-  const int tmp_ystride = 64;
-  int weight;
-  int edge[4];
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  edge[0] = xd->mb_to_top_edge;
-  edge[1] = xd->mb_to_bottom_edge;
-  edge[2] = xd->mb_to_left_edge;
-  edge[3] = xd->mb_to_right_edge;
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_secondmv;
-  base_pre = xd->second_pre.y_buffer;
-  pre_stride = xd->second_pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-  // First generate the second predictor
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[1]),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[1],
-        16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[1]),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        &xd->scale_factor[1], 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  // Compute consistency metric
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_mvs;
-  base_pre = xd->pre.y_buffer;
-  pre_stride = xd->pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-  // Now generate the first predictor
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[0]),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[0],
-        16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[0]),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        &xd->scale_factor[0], 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
+  foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
 }
-
-static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs = which_mv ?
-        xd->mode_info_context->mbmi.need_to_clamp_secondmv :
-         xd->mode_info_context->mbmi.need_to_clamp_mvs;
-
-    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
-    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
-    int_mv ymv;
-    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor(base_pre, pre_stride,
-                              dst_y, dst_ystride,
-                              &ymv, &xd->scale_factor[which_mv],
-                              16, 16, which_mv ? weight : 0, &xd->subpix);
-  }
-}
-
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-
-  build_inter16x16_predictors_mby_w(xd, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs = which_mv ?
-         xd->mode_info_context->mbmi.need_to_clamp_secondmv :
-         xd->mode_info_context->mbmi.need_to_clamp_mvs;
-
-    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
-    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
-    int_mv ymv;
-    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor(base_pre, pre_stride,
-                              dst_y, dst_ystride,
-                              &ymv, &xd->scale_factor[which_mv],
-                              16, 16, which_mv, &xd->subpix);
-  }
-}
-#endif
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_uvstride,
-                                               int weight,
-                                               int mb_row,
-                                               int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv _o16x16mv;
-    int_mv _16x16mv;
-
-    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-    _o16x16mv = _16x16mv;
-    /* calc uv motion vectors */
-    if (_16x16mv.as_mv.row < 0)
-      _16x16mv.as_mv.row -= 1;
-    else
-      _16x16mv.as_mv.row += 1;
-
-    if (_16x16mv.as_mv.col < 0)
-      _16x16mv.as_mv.col -= 1;
-    else
-      _16x16mv.as_mv.col += 1;
-
-    _16x16mv.as_mv.row /= 2;
-    _16x16mv.as_mv.col /= 2;
-
-    _16x16mv.as_mv.row &= xd->fullpixel_mask;
-    _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    set_scaled_offsets(&xd->scale_factor_uv[which_mv],
-                       mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv ? weight : 0, &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv ? weight : 0, &xd->subpix);
-  }
-}
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
+                                     int mi_row,
+                                     int mi_col,
+                                     BLOCK_SIZE_TYPE bsize) {
+  struct build_inter_predictors_args args = {
+    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+#if CONFIG_ALPHA
+    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+     xd->plane[3].dst.buf},
+    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
+     xd->plane[3].dst.stride},
+    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
+      xd->plane[3].pre[0].buf},
+     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
+      xd->plane[3].pre[1].buf}},
+    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
+      xd->plane[3].pre[0].stride},
+     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
+      xd->plane[3].pre[1].stride}},
 #else
-  int weight = AVERAGE_WEIGHT;
+    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
+    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
+    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
+     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
+    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
+     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
 #endif
-  build_inter16x16_predictors_mbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                     weight, mb_row, mb_col);
+  };
+  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
 }
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE_TYPE bsize) {
 
-#else
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv _o16x16mv;
-    int_mv _16x16mv;
-
-    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-    _o16x16mv = _16x16mv;
-    /* calc uv motion vectors */
-    if (_16x16mv.as_mv.row < 0)
-      _16x16mv.as_mv.row -= 1;
-    else
-      _16x16mv.as_mv.row += 1;
-
-    if (_16x16mv.as_mv.col < 0)
-      _16x16mv.as_mv.col -= 1;
-    else
-      _16x16mv.as_mv.col += 1;
-
-    _16x16mv.as_mv.row /= 2;
-    _16x16mv.as_mv.col /= 2;
-
-    _16x16mv.as_mv.row &= xd->fullpixel_mask;
-    _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    set_scaled_offsets(&xd->scale_factor_uv[which_mv],
-                       mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-  }
+  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
 }
-#endif
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter32x32_predictors_sby_w(MACROBLOCKD *x,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
-                                                y_idx * 16,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 16,
-                               y_idx * 16,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-    build_inter16x16_predictors_mby_w(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_ystride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);
-  build_inter32x32_predictors_sby_w(x, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-// TODO(all): Can we use 32x32 specific implementations of this rather than
-// using 16x16 implementations ?
-void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
-                                                y_idx * 16,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 16,
-                               y_idx * 16,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-    vp9_build_inter16x16_predictors_mby(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_ystride, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-#endif
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter32x32_predictors_sbuv_w(MACROBLOCKD *x,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_uvstride,
-                                               int weight,
-                                               int mb_row,
-                                               int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    build_inter16x16_predictors_mbuv_w(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-
-void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-  build_inter32x32_predictors_sbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                     weight, mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *x,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    vp9_build_inter16x16_predictors_mbuv(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-#endif
-
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter32x32_predictors_sby(x, dst_y, dst_ystride,
-                                      mb_row, mb_col);
-  vp9_build_inter32x32_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,
-                                      mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_32x32_predictors_sb(
-        x, dst_y, dst_u, dst_v, dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter64x64_predictors_sby_w(MACROBLOCKD *x,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
-                                                y_idx * 32,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 32,
-                               y_idx * 32,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-
-    build_inter32x32_predictors_sby_w(x,
-        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
-        dst_ystride, weight, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);
-  build_inter64x64_predictors_sby_w(x, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
-                                                y_idx * 32,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 32,
-                               y_idx * 32,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-
-    vp9_build_inter32x32_predictors_sby(x,
-        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
-        dst_ystride, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-#endif
-
-void vp9_build_inter64x64_predictors_sbuv(MACROBLOCKD *x,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    int scaled_uv_offset;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
-                                            y_idx * 16,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
-                                              y_idx * 16,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    vp9_build_inter32x32_predictors_sbuv(x,
-        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-
-void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter64x64_predictors_sby(x, dst_y, dst_ystride,
-                                      mb_row, mb_col);
-  vp9_build_inter64x64_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,
-                                       mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,
-                                             dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,
-                                         int mb_row, int mb_col) {
-  int i;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  int which_mv = 0;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && defined(USE_IMPLICIT_WEIGHT_SPLITMV)
-  int weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        if (mbmi->need_to_clamp_mvs) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-        }
-
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix,
-                                  mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-#if !defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = AVERAGE_WEIGHT;
-#endif
-#endif
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0, &xd->subpix,
-                                mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
-}
-
-static INLINE int round_mv_comp(int value) {
-  return (value < 0 ? value - 4 : value + 4) / 8;
-}
-
-static int mi_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int mi_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int b_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.row;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int b_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.col;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-
-static void build_4x4uvmvs(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
-      u->row = mi_mv_pred_row(xd, yoffset, 0);
-      u->col = mi_mv_pred_col(xd, yoffset, 0);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      v->row = u->row;
-      v->col = u->col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-        u->row = mi_mv_pred_row(xd, yoffset, 1);
-        u->col = mi_mv_pred_col(xd, yoffset, 1);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        v->row = u->row;
-        v->col = u->col;
-      }
-    }
-  }
-}
-
-void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col);
-  vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride,
-                                       mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
-                                             dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col) {
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                       &xd->predictor[256],
-                                       &xd->predictor[320], 16, 8,
-                                       mb_row, mb_col);
-
-  } else {
-    build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd, mb_row, mb_col);
-  }
-}
-
 /*encoder only*/
 void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
                                         int mb_row, int mb_col) {
-  int i, j, weight;
-  BLOCKD *const blockd = xd->block;
+  vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
+                                  BLOCK_SIZE_MB16X16);
+}
 
-  /* build uv mvs */
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
-
-      v->row = u->row = b_mv_pred_row(xd, yoffset, 0);
-      v->col = u->col = b_mv_pred_col(xd, yoffset, 0);
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-
-        v->row = u->row = b_mv_pred_row(xd, yoffset, 1);
-        v->row = u->col = b_mv_pred_row(xd, yoffset, 1);
-      }
-    }
+// TODO(dkovalev): find better place for this function
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
+  const int ref = cm->active_ref_idx[i];
+  struct scale_factors *const sf = &cm->active_ref_scale[i];
+  if (ref >= NUM_YV12_BUFFERS) {
+    memset(sf, 0, sizeof(*sf));
+  } else {
+    YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
+    vp9_setup_scale_factors_for_frame(sf,
+                                      fb->y_crop_width, fb->y_crop_height,
+                                      cm->width, cm->height);
   }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && \
-  defined(USE_IMPLICIT_WEIGHT_SPLITMV) && \
-  defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  weight = AVERAGE_WEIGHT;
-#endif
-  for (i = 16; i < 24; i += 2) {
-    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    int which_mv;
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0,
-                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
 }
+
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -15,61 +15,26 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 struct subpix_fn_table;
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
+                                    int mb_row,
+                                    int mb_col,
+                                    BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
+                                     int mb_row,
+                                     int mb_col,
+                                     BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
+                                   int mb_row, int mb_col,
+                                   BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col);
-
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
-                                        int mb_row,
-                                        int mb_col);
-
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
 
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h);
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -81,51 +46,73 @@
 
 void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
-                                  const int_mv *fullpel_mv_q3,
-                                  const int_mv *frac_mv_q4,
+                                  const int_mv *mv_q4,
                                   const struct scale_factors *scale,
                                   int w, int h, int do_avg,
                                   const struct subpix_fn_table *subpix);
 
-static int scale_value_x(int val, const struct scale_factors *scale) {
-  return val * scale->x_num / scale->x_den;
+static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                const struct scale_factors *scale) {
+  const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
+  const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+  return y * stride + x;
 }
 
-static int scale_value_y(int val, const struct scale_factors *scale) {
-  return val * scale->y_num / scale->y_den;
+static void setup_pred_plane(struct buf_2d *dst,
+                             uint8_t *src, int stride,
+                             int mi_row, int mi_col,
+                             const struct scale_factors *scale,
+                             int subsampling_x, int subsampling_y) {
+  const int x = (MI_SIZE * mi_col) >> subsampling_x;
+  const int y = (MI_SIZE * mi_row) >> subsampling_y;
+  dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+  dst->stride = stride;
 }
 
-static int scaled_buffer_offset(int x_offset,
-                                int y_offset,
-                                int stride,
-                                const struct scale_factors *scale) {
-  return scale_value_y(y_offset, scale) * stride +
-      scale_value_x(x_offset, scale);
+// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col
+static void setup_dst_planes(MACROBLOCKD *xd,
+                             const YV12_BUFFER_CONFIG *src,
+                             int mi_row, int mi_col) {
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                         src->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                    src->alpha_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblockd_plane *pd = &xd->plane[i];
+    setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
+                     pd->subsampling_x, pd->subsampling_y);
+  }
 }
 
-static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mb_row, int mb_col,
+static void setup_pre_planes(MACROBLOCKD *xd,
+                             const YV12_BUFFER_CONFIG *src0,
+                             const YV12_BUFFER_CONFIG *src1,
+                             int mi_row, int mi_col,
                              const struct scale_factors *scale,
                              const struct scale_factors *scale_uv) {
-  const int recon_y_stride = src->y_stride;
-  const int recon_uv_stride = src->uv_stride;
-  int recon_yoffset;
-  int recon_uvoffset;
+  const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1};
+  int i, j;
 
-  if (scale) {
-    recon_yoffset = scaled_buffer_offset(16 * mb_col, 16 * mb_row,
-                                         recon_y_stride, scale);
-    recon_uvoffset = scaled_buffer_offset(8 * mb_col, 8 * mb_row,
-                                          recon_uv_stride, scale_uv);
-  } else {
-    recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
-    recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
+  for (i = 0; i < 2; ++i) {
+    const YV12_BUFFER_CONFIG *src = srcs[i];
+    if (src) {
+      uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                             src->alpha_buffer};
+      int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                        src->alpha_stride};
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        struct macroblockd_plane *pd = &xd->plane[j];
+        const struct scale_factors *sf = j ? scale_uv : scale;
+        setup_pred_plane(&pd->pre[i],
+                         buffers[j], strides[j],
+                         mi_row, mi_col, sf ? &sf[i] : NULL,
+                         pd->subsampling_x, pd->subsampling_y);
+      }
+    }
   }
-  *dst = *src;
-  dst->y_buffer += recon_yoffset;
-  dst->u_buffer += recon_uvoffset;
-  dst->v_buffer += recon_uvoffset;
 }
 
 static void set_scale_factors(MACROBLOCKD *xd,
@@ -137,5 +124,7 @@
   xd->scale_factor_uv[0] = xd->scale_factor[0];
   xd->scale_factor_uv[1] = xd->scale_factor[1];
 }
+
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -13,773 +13,345 @@
 #include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 
-// For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
-// and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
-
-// Using multiplication and shifting instead of division in diagonal prediction.
-// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i+2) and used as
-// ((A + B) * iscale[i] + (1 << 15)) >> 16;
-// where A and B are weighted pixel values.
-static const unsigned int iscale[64] = {
-  32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,
-   6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,
-   3641,  3449,  3277,  3121,  2979,  2849,  2731,  2621,
-   2521,  2427,  2341,  2260,  2185,  2114,  2048,  1986,
-   1928,  1872,  1820,  1771,  1725,  1680,  1638,  1598,
-   1560,  1524,  1489,  1456,  1425,  1394,  1365,  1337,
-   1311,  1285,  1260,  1237,  1214,  1192,  1170,  1150,
-   1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,
-};
-
-static INLINE int iscale_round(int value, int i) {
-    return ROUND_POWER_OF_TWO(value * iscale[i], 16);
-}
-
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  r = 0;
-  for (c = 0; c < n - 2; c++) {
-    int a = c & 1 ? yleft_col[r + 1]
-                  : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-    int b = yabove_row[c + 2];
-    ypred_ptr[c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
+  // first column
+  for (r = 0; r < bh - 1; ++r) {
+      ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
+                                                   yleft_col[r + 1], 1);
   }
-
-  for (r = 1; r < n / 2 - 1; r++) {
-    for (c = 0; c < n - 2 - 2 * r; c++) {
-      int a = c & 1 ? yleft_col[r + 1]
-                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-      int b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
-    }
+  ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
+  ypred_ptr++;
+  // second column
+  for (r = 0; r < bh - 2; ++r) {
+      ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
+                                                   yleft_col[r + 1] * 2 +
+                                                   yleft_col[r + 2], 2);
   }
+  ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] +
+                                                      yleft_col[bh - 1] * 3,
+                                                      2);
+  ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
+  ypred_ptr++;
 
-  for (; r < n - 1; r++) {
-    for (c = 0; c < n; c++) {
-      int v = c & 1 ? yleft_col[r + 1]
-                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-      int h = r - c / 2;
-      ypred_ptr[h * y_stride + c] = v;
-    }
+  // rest of last row
+  for (c = 0; c < bw - 2; ++c) {
+    ypred_ptr[(bh - 1) * y_stride + c] = yleft_col[bh-1];
   }
 
-  c = 0;
-  r = n - 1;
-  ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride] +
-                                               yleft_col[r], 1);
-  for (r = n - 2; r >= n / 2; --r) {
-    int w = c + (n - 1 - r) * 2;
-    ypred_ptr[r * y_stride + w] =
-        ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
-                           ypred_ptr[r * y_stride + w - 1], 1);
-  }
-
-  for (c = 1; c < n; c++) {
-    for (r = n - 1; r >= n / 2 + c / 2; --r) {
-      int w = c + (n - 1 - r) * 2;
-      ypred_ptr[r * y_stride + w] =
-          ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
-                             ypred_ptr[r * y_stride + w - 1], 1);
+  for (r = bh - 2; r >= 0; --r) {
+    for (c = 0; c < bw - 2; ++c) {
+      ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2];
     }
   }
 }
 
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  c = 0;
-  for (r = 0; r < n - 2; r++) {
-    int a = r & 1 ? yabove_row[c + 1]
-                  : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-    int b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = iscale_round(2 * a + (r + 1) * b, 1 + r);
-  }
-
-  for (c = 1; c < n / 2 - 1; c++) {
-    for (r = 0; r < n - 2 - 2 * c; r++) {
-      int a = r & 1 ? yabove_row[c + 1]
-                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-      int b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
+  for (r = 0; r < bh; ++r) {
+    for (c = 0; c < bw; ++c) {
+      if (r & 1) {
+        ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
+                                          yabove_row[r/2 + c + 1] * 2 +
+                                          yabove_row[r/2 + c + 2], 2);
+      } else {
+        ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
+                                         yabove_row[r/2+ c + 1], 1);
+      }
     }
+    ypred_ptr += y_stride;
   }
-
-  for (; c < n - 1; ++c) {
-    for (r = 0; r < n; r++) {
-      int v = r & 1 ? yabove_row[c + 1]
-                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-      int w = c - r / 2;
-      ypred_ptr[r * y_stride + w] = v;
-    }
-  }
-
-  r = 0;
-  c = n - 1;
-  ypred_ptr[c] = ROUND_POWER_OF_TWO(ypred_ptr[(c - 1)] + yabove_row[c], 1);
-  for (c = n - 2; c >= n / 2; --c) {
-    int h = r + (n - 1 - c) * 2;
-    ypred_ptr[h * y_stride + c] =
-         ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
-                            ypred_ptr[(h - 1) * y_stride + c], 1);
-  }
-
-  for (r = 1; r < n; r++) {
-    for (c = n - 1; c >= n / 2 + r / 2; --c) {
-      int h = r + (n - 1 - c) * 2;
-      ypred_ptr[h * y_stride + c] =
-          ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
-                             ypred_ptr[(h - 1) * y_stride + c], 1);
-    }
-  }
 }
 
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d45_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  for (r = 0; r < n - 1; ++r) {
-    for (c = 0; c <= r; ++c) {
-      ypred_ptr[(r - c) * y_stride + c] = iscale_round(
-          yabove_row[r + 1] * (c + 1) + yleft_col[r + 1] * (r - c + 1), r);
+  for (r = 0; r < bh; ++r) {
+    for (c = 0; c < bw; ++c) {
+      if (r + c + 2 < bw * 2)
+        ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] +
+                                          yabove_row[r + c + 1] * 2 +
+                                          yabove_row[r + c + 2], 2);
+      else
+        ypred_ptr[c] = yabove_row[bw * 2 - 1];
     }
+    ypred_ptr += y_stride;
   }
-
-  for (c = 0; c <= r; ++c) {
-    int yabove_ext = yabove_row[r];  // clip_pixel(2 * yabove_row[r] -
-                                     //            yabove_row[r - 1]);
-    int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -
-                                   //            yleft_col[r-1]);
-    ypred_ptr[(r - c) * y_stride + c] =
-         iscale_round(yabove_ext * (c + 1) + yleft_ext * (r - c + 1), r);
-  }
-  for (r = 1; r < n; ++r) {
-    for (c = n - r; c < n; ++c) {
-      const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];
-      const int yleft_ext = ypred_ptr[r * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] =
-          ROUND_POWER_OF_TWO(yabove_ext + yleft_ext, 1);
-    }
-  }
 }
 
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d117_predictor(uint8_t *ypred_ptr, int y_stride,
+                           int bw, int bh,
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-  for (c = 0; c < n; c++)
+  // first row
+  for (c = 0; c < bw; c++)
     ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1);
   ypred_ptr += y_stride;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
+
+  // second row
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  for (c = 1; c < bw; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
+                                      yabove_row[c - 1] * 2 +
+                                      yabove_row[c], 2);
   ypred_ptr += y_stride;
-  for (r = 2; r < n; ++r) {
-    ypred_ptr[0] = yleft_col[r - 2];
-    for (c = 1; c < n; c++)
+
+  // the rest of first col
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                    yleft_col[0] * 2 +
+                                    yleft_col[1], 2);
+  for (r = 3; r < bh; ++r)
+    ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] +
+                                                     yleft_col[r - 2] * 2 +
+                                                     yleft_col[r - 1], 2);
+  // the rest of the block
+  for (r = 2; r < bh; ++r) {
+    for (c = 1; c < bw; c++)
       ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
     ypred_ptr += y_stride;
   }
 }
 
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+
+static void d135_predictor(uint8_t *ypred_ptr, int y_stride,
+                           int bw, int bh,
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-  ypred_ptr[0] = yabove_row[-1];
-  for (c = 1; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  for (r = 1; r < n; ++r)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  for (c = 1; c < bw; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
+                                      yabove_row[c - 1] * 2 +
+                                      yabove_row[c], 2);
 
+  ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                           yleft_col[0] * 2 +
+                                           yleft_col[1], 2);
+  for (r = 2; r < bh; ++r)
+    ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
+                                                 yleft_col[r - 1] * 2 +
+                                                 yleft_col[r], 2);
+
   ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 1; c < n; c++) {
+  for (r = 1; r < bh; ++r) {
+    for (c = 1; c < bw; c++)
       ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
-    }
     ypred_ptr += y_stride;
   }
 }
 
-static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
+static void d153_predictor(uint8_t *ypred_ptr,
+                           int y_stride,
+                           int bw, int bh,
+                           uint8_t *yabove_row,
+                           uint8_t *yleft_col) {
   int r, c;
   ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);
-  for (r = 1; r < n; r++)
+  for (r = 1; r < bh; r++)
     ypred_ptr[r * y_stride] =
         ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1);
   ypred_ptr++;
-  ypred_ptr[0] = yabove_row[-1];
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                           yleft_col[0] * 2 +
+                                           yleft_col[1], 2);
+  for (r = 2; r < bh; r++)
+    ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
+                                                 yleft_col[r - 1] * 2 +
+                                                 yleft_col[r], 2);
   ypred_ptr++;
 
-  for (c = 0; c < n - 2; c++)
-    ypred_ptr[c] = yabove_row[c];
+  for (c = 0; c < bw - 2; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] +
+                                      yabove_row[c] * 2 +
+                                      yabove_row[c + 1], 2);
   ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 0; c < n - 2; c++)
+  for (r = 1; r < bh; ++r) {
+    for (c = 0; c < bw - 2; c++)
       ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
     ypred_ptr += y_stride;
   }
 }
 
-static void corner_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                             uint8_t *yabove_row,
-                             uint8_t *yleft_col) {
-  int mh, mv, maxgradh, maxgradv, x, y, nx, ny;
-  int i, j;
-  int top_left = yabove_row[-1];
-  mh = mv = 0;
-  maxgradh = yabove_row[1] - top_left;
-  maxgradv = yleft_col[1] - top_left;
-  for (i = 2; i < n; ++i) {
-    int gh = yabove_row[i] - yabove_row[i - 2];
-    int gv = yleft_col[i] - yleft_col[i - 2];
-    if (gh > maxgradh) {
-      maxgradh = gh;
-      mh = i - 1;
-    }
-    if (gv > maxgradv) {
-      maxgradv = gv;
-      mv = i - 1;
-    }
-  }
-  nx = mh + mv + 3;
-  ny = 2 * n + 1 - nx;
-
-  x = top_left;
-  for (i = 0; i <= mh; ++i) x += yabove_row[i];
-  for (i = 0; i <= mv; ++i) x += yleft_col[i];
-  x += (nx >> 1);
-  x /= nx;
-  y = 0;
-  for (i = mh + 1; i < n; ++i) y += yabove_row[i];
-  for (i = mv + 1; i < n; ++i) y += yleft_col[i];
-  y += (ny >> 1);
-  y /= ny;
-
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j)
-      ypred_ptr[j] = (i <= mh && j <= mv ? x : y);
-    ypred_ptr += y_stride;
-  }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
-  int i;
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-static INLINE int log2_minus_1(int n) {
-  switch (n) {
-    case 4: return 1;
-    case 8: return 2;
-    case 16: return 3;
-    case 32: return 4;
-    case 64: return 5;
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-
-void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
-                                         uint8_t *ypred_ptr,
-                                         int y_stride, int mode, int bsize,
-                                         int up_available, int left_available,
-                                         int right_available) {
+void vp9_build_intra_predictors(uint8_t *src, int src_stride,
+                                uint8_t *ypred_ptr,
+                                int y_stride, int mode,
+                                int bw, int bh,
+                                int up_available, int left_available,
+                                int right_available) {
   int r, c, i;
-  uint8_t yleft_col[64], yabove_data[65], ytop_left;
+  uint8_t yleft_col[64], yabove_data[129], ytop_left;
   uint8_t *yabove_row = yabove_data + 1;
-  /*
-   * 127 127 127 .. 127 127 127 127 127 127
-   * 129  A   B  ..  Y   Z
-   * 129  C   D  ..  W   X
-   * 129  E   F  ..  U   V
-   * 129  G   H  ..  S   T   T   T   T   T
-   *  ..
-   */
 
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // ..
+
+  assert(bw == bh);
+
   if (left_available) {
-    for (i = 0; i < bsize; i++)
+    for (i = 0; i < bh; i++)
       yleft_col[i] = src[i * src_stride - 1];
   } else {
-    vpx_memset(yleft_col, 129, bsize);
+    vpx_memset(yleft_col, 129, bh);
   }
 
   if (up_available) {
     uint8_t *yabove_ptr = src - src_stride;
-    vpx_memcpy(yabove_row, yabove_ptr, bsize);
-    if (left_available) {
-      ytop_left = yabove_ptr[-1];
-    } else {
-      ytop_left = 127;
-    }
+    vpx_memcpy(yabove_row, yabove_ptr, bw);
+    if (bw == 4 && right_available)
+      vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw);
+    else
+      vpx_memset(yabove_row + bw, yabove_row[bw -1], bw);
+    ytop_left = left_available ? yabove_ptr[-1] : 129;
   } else {
-    vpx_memset(yabove_row, 127, bsize);
+    vpx_memset(yabove_row, 127, bw * 2);
     ytop_left = 127;
   }
   yabove_row[-1] = ytop_left;
 
-  /* for Y */
   switch (mode) {
     case DC_PRED: {
-      int expected_dc;
       int i;
-      int shift;
+      int expected_dc = 128;
       int average = 0;
-      int log2_bsize_minus_1 = log2_minus_1(bsize);
+      int count = 0;
 
       if (up_available || left_available) {
         if (up_available) {
-          for (i = 0; i < bsize; i++) {
+          for (i = 0; i < bw; i++)
             average += yabove_row[i];
-          }
+          count += bw;
         }
-
         if (left_available) {
-          for (i = 0; i < bsize; i++) {
+          for (i = 0; i < bh; i++)
             average += yleft_col[i];
-          }
+          count += bh;
         }
-        shift = log2_bsize_minus_1 + up_available + left_available;
-        expected_dc = ROUND_POWER_OF_TWO(average, shift);
-      } else {
-        expected_dc = 128;
+        expected_dc = (average + (count >> 1)) / count;
       }
-
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, expected_dc, bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bw);
         ypred_ptr += y_stride;
       }
     }
     break;
     case V_PRED:
-      for (r = 0; r < bsize; r++) {
-        memcpy(ypred_ptr, yabove_row, bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memcpy(ypred_ptr, yabove_row, bw);
         ypred_ptr += y_stride;
       }
       break;
     case H_PRED:
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, yleft_col[r], bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bw);
         ypred_ptr += y_stride;
       }
       break;
     case TM_PRED:
-      for (r = 0; r < bsize; r++) {
-        for (c = 0; c < bsize; c++) {
+      for (r = 0; r < bh; r++) {
+        for (c = 0; c < bw; c++)
           ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
-        }
-
         ypred_ptr += y_stride;
       }
       break;
     case D45_PRED:
-      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D135_PRED:
-      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D117_PRED:
-      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D153_PRED:
-      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D27_PRED:
-      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D63_PRED:
-      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
-    case I8X8_PRED:
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
-}
-
-#if CONFIG_COMP_INTERINTRA_PRED
-static void combine_interintra(MB_PREDICTION_MODE mode,
-                               uint8_t *interpred,
-                               int interstride,
-                               uint8_t *intrapred,
-                               int intrastride,
-                               int size) {
-  // TODO(debargha): Explore different ways of combining predictors
-  //                 or designing the tables below
-  static const int scale_bits = 8;
-  static const int scale_max = 256;     // 1 << scale_bits;
-  static const int scale_round = 127;   // (1 << (scale_bits - 1));
-  // This table is a function A + B*exp(-kx), where x is hor. index
-  static const int weights1d[64] = {
-    128, 125, 122, 119, 116, 114, 111, 109,
-    107, 105, 103, 101,  99,  97,  96,  94,
-     93,  91,  90,  89,  88,  86,  85,  84,
-     83,  82,  81,  81,  80,  79,  78,  78,
-     77,  76,  76,  75,  75,  74,  74,  73,
-     73,  72,  72,  71,  71,  71,  70,  70,
-     70,  70,  69,  69,  69,  69,  68,  68,
-     68,  68,  68,  67,  67,  67,  67,  67,
-  };
-
-  int size_scale = (size >= 64 ? 1:
-                    size == 32 ? 2 :
-                    size == 16 ? 4 :
-                    size == 8  ? 8 : 16);
-  int i, j;
-  switch (mode) {
-    case V_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[i * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case H_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[j * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D63_PRED:
-    case D117_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] * 3 +
-                       weights1d[j * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D27_PRED:
-    case D153_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[j * size_scale] * 3 +
-                       weights1d[i * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D135_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[(i < j ? i : j) * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D45_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case TM_PRED:
-    case DC_PRED:
     default:
-      // simple average
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1;
-        }
-      }
       break;
   }
 }
 
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride, int uvstride) {
-  vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride);
-  vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride);
+void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
+                                      BLOCK_SIZE_TYPE bsize) {
+  const struct macroblockd_plane* const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride,
+                             pd->dst.buf, pd->dst.stride,
+                             xd->mode_info_context->mbmi.mode,
+                             bw, bh, xd->up_available, xd->left_available,
+                             0 /*xd->right_available*/);
 }
 
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[256];
-  vp9_build_intra_predictors_internal(
-      xd->dst.y_buffer, xd->dst.y_stride,
-      intrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 16, 16);
-}
+void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd,
+                                       BLOCK_SIZE_TYPE bsize) {
+  const int bwl = b_width_log2(bsize), bw = 2 << bwl;
+  const int bhl = b_height_log2(bsize), bh = 2 << bhl;
 
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[64];
-  uint8_t vintrapredictor[64];
-  vp9_build_intra_predictors_internal(
-      xd->dst.u_buffer, xd->dst.uv_stride,
-      uintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors_internal(
-      xd->dst.v_buffer, xd->dst.uv_stride,
-      vintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 8, 8);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 8, 8);
+  vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
+                             xd->plane[1].dst.buf, xd->plane[1].dst.stride,
+                             xd->mode_info_context->mbmi.uv_mode,
+                             bw, bh, xd->up_available,
+                             xd->left_available, 0 /*xd->right_available*/);
+  vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                             xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                             xd->mode_info_context->mbmi.uv_mode,
+                             bw, bh, xd->up_available,
+                             xd->left_available, 0 /*xd->right_available*/);
 }
 
-void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[1024];
-  vp9_build_intra_predictors_internal(
-      xd->dst.y_buffer, xd->dst.y_stride,
-      intrapredictor, 32,
-      xd->mode_info_context->mbmi.interintra_mode, 32,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 32, 32);
-}
+void vp9_predict_intra_block(MACROBLOCKD *xd,
+                            int block_idx,
+                            int bwl_in,
+                            TX_SIZE tx_size,
+                            int mode,
+                            uint8_t *predictor, int pre_stride) {
+  const int bwl = bwl_in - tx_size;
+  const int wmask = (1 << bwl) - 1;
+  const int have_top = (block_idx >> bwl) || xd->up_available;
+  const int have_left = (block_idx & wmask) || xd->left_available;
+  const int have_right = ((block_idx & wmask) != wmask);
+  const int txfm_block_size = 4 << tx_size;
 
-void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[256];
-  uint8_t vintrapredictor[256];
-  vp9_build_intra_predictors_internal(
-      xd->dst.u_buffer, xd->dst.uv_stride,
-      uintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors_internal(
-      xd->dst.v_buffer, xd->dst.uv_stride,
-      vintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 16, 16);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 16, 16);
+  assert(bwl >= 0);
+  vp9_build_intra_predictors(predictor, pre_stride,
+                             predictor, pre_stride,
+                             mode,
+                             txfm_block_size,
+                             txfm_block_size,
+                             have_top, have_left,
+                             have_right);
 }
 
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
-}
-
-void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[4096];
-  const int mode = xd->mode_info_context->mbmi.interintra_mode;
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      intrapredictor, 64, mode, 64,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 64, 64);
-}
-
-void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[1024];
-  uint8_t vintrapredictor[1024];
-  const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      uintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 32, 32);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 32, 32);
-}
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
-}
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
-void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->predictor, 16,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 64,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
-                                              uint8_t *upred_ptr,
-                                              uint8_t *vpred_ptr,
-                                              int uv_stride,
-                                              int mode, int bsize) {
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      upred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vpred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
-                                           &xd->predictor[320], 8,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           16);
-}
-
-void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           32);
-}
-
-void vp9_intra8x8_predict(MACROBLOCKD *xd,
-                          BLOCKD *b,
+void vp9_intra4x4_predict(MACROBLOCKD *xd,
+                          int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int mode,
-                          uint8_t *predictor) {
-  const int block4x4_idx = (b - xd->block);
-  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1)  || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
-                                      b->dst_stride, predictor, 16,
-                                      mode, 8, have_top, have_left,
-                                      have_right);
+                          uint8_t *predictor, int pre_stride) {
+  vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4,
+                          mode, predictor, pre_stride);
 }
-
-void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
-                             BLOCKD *b,
-                             int mode,
-                             uint8_t *predictor) {
-  const int block_idx = (b - xd->block) & 3;
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1)  || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
-                                      b->dst_stride, predictor, 8,
-                                      mode, 4, have_top, have_left,
-                                      have_right);
-}
-
-/* TODO: try different ways of use Y-UV mode correlation
-   Current code assumes that a uv 4x4 block use same mode
-   as corresponding Y 8x8 area
-   */
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,44 +14,17 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
+MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                               int stride, int n,
+                                               int tx, int ty);
 
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty);
+MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block,
+                                          uint8_t *ptr, int stride);
 
-B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
-
-#if CONFIG_COMP_INTERINTRA_PRED
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride);
-
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride);
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
+void vp9_predict_intra_block(MACROBLOCKD *xd,
+                            int block_idx,
+                            int bwl_in,
+                            TX_SIZE tx_size,
+                            int mode,
+                            uint8_t *predictor, int pre_stride);
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
--- a/vp9/common/vp9_reconintra4x4.c
+++ /dev/null
@@ -1,503 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9_rtcd.h"
-
-#if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
-                             int dx, int dy) {
-  int i, j;
-  int count = 0, gsum = 0, gdiv;
-  /* TODO: Make this code more efficient by breaking up into two loops */
-  for (i = -ty; i < n; ++i)
-    for (j = -tx; j < n; ++j) {
-      int g;
-      if (i >= 0 && j >= 0) continue;
-      if (i + dy >= 0 && j + dx >= 0) continue;
-      if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
-      g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
-      gsum += g * g;
-      count++;
-    }
-  gdiv = (dx * dx + dy * dy) * count;
-  return ((gsum << 8) + (gdiv >> 1)) / gdiv;
-}
-
-#if CONTEXT_PRED_REPLACEMENTS == 6
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d %d %d = %d %d\n",
-         g[1], g[2], g[3], g[5], g[6], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_HD_PRED;
-    case 2:
-      return B_RD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 6:
-      return B_LD_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 3; i < 8; i+=2)
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 3; i < 8; i+=2)
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d = %d %d\n",
-         g[1], g[3], g[5], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_HD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[0] = find_grad_measure(ptr, stride, n, tx, ty,  1, 0);
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[4] = find_grad_measure(ptr, stride, n, tx, ty,  0, 1);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imax = 0;
-  for (i = 1; i < 8; i++)
-    imax = (g[i] > g[imax] ? i : imax);
-  imin = 0;
-  for (i = 1; i < 8; i++)
-    imin = (g[i] < g[imin] ? i : imin);
-
-  switch (imin) {
-    case 0:
-      return B_HE_PRED;
-    case 1:
-      return B_HD_PRED;
-    case 2:
-      return B_RD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 4:
-      return B_VE_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 6:
-      return B_LD_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#endif
-
-B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {
-  const int block_idx = x - xd->block;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  uint8_t *ptr = *(x->base_dst) + x->dst;
-  int stride = x->dst_stride;
-  int tx = have_left ? 4 : 0;
-  int ty = have_top ? 4 : 0;
-  if (!have_left && !have_top)
-    return B_DC_PRED;
-  return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
-}
-#endif
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
-                          BLOCKD *x,
-                          int b_mode,
-                          uint8_t *predictor) {
-  int i, r, c;
-  const int block_idx = x - xd->block;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  const int have_right = (block_idx & 3) != 3 || xd->right_available;
-  uint8_t left[4], above[8], top_left;
-  /*
-   * 127 127 127 .. 127 127 127 127 127 127
-   * 129  A   B  ..  Y   Z
-   * 129  C   D  ..  W   X
-   * 129  E   F  ..  U   V
-   * 129  G   H  ..  S   T   T   T   T   T
-   *  ..
-   */
-
-  if (have_left) {
-    uint8_t *left_ptr = *(x->base_dst) + x->dst - 1;
-    const int stride = x->dst_stride;
-
-    left[0] = left_ptr[0 * stride];
-    left[1] = left_ptr[1 * stride];
-    left[2] = left_ptr[2 * stride];
-    left[3] = left_ptr[3 * stride];
-  } else {
-    left[0] = left[1] = left[2] = left[3] = 129;
-  }
-
-  if (have_top) {
-    uint8_t *above_ptr = *(x->base_dst) + x->dst - x->dst_stride;
-
-    if (have_left) {
-      top_left = above_ptr[-1];
-    } else {
-      top_left = 127;
-    }
-
-    above[0] = above_ptr[0];
-    above[1] = above_ptr[1];
-    above[2] = above_ptr[2];
-    above[3] = above_ptr[3];
-    if (((block_idx & 3) != 3) ||
-        (have_right && block_idx == 3 &&
-         ((xd->mb_index != 3 && xd->sb_index != 3) ||
-          ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
-      above[4] = above_ptr[4];
-      above[5] = above_ptr[5];
-      above[6] = above_ptr[6];
-      above[7] = above_ptr[7];
-    } else if (have_right) {
-      uint8_t *above_right = above_ptr + 4;
-
-      if (xd->sb_index == 3 && (xd->mb_index & 1))
-        above_right -= 32 * x->dst_stride;
-      if (xd->mb_index == 3)
-        above_right -= 16 * x->dst_stride;
-      above_right -= (block_idx & ~3) * x->dst_stride;
-
-      /* use a more distant above-right (from closest available top-right
-       * corner), but with a "localized DC" (similar'ish to TM-pred):
-       *
-       *  A   B   C   D   E   F   G   H
-       *  I   J   K   L
-       *  M   N   O   P
-       *  Q   R   S   T
-       *  U   V   W   X   x1  x2  x3  x4
-       *
-       * Where:
-       * x1 = clip_pixel(E + X - D)
-       * x2 = clip_pixel(F + X - D)
-       * x3 = clip_pixel(G + X - D)
-       * x4 = clip_pixel(H + X - D)
-       *
-       * This is applied anytime when we use a "distant" above-right edge
-       * that is not immediately top-right to the block that we're going
-       * to do intra prediction for.
-       */
-      above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]);
-      above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]);
-      above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]);
-      above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]);
-    } else {
-      // extend edge
-      above[4] = above[5] = above[6] = above[7] = above[3];
-    }
-  } else {
-    above[0] = above[1] = above[2] = above[3] = 127;
-    above[4] = above[5] = above[6] = above[7] = 127;
-    top_left = 127;
-  }
-
-#if CONFIG_NEWBINTRAMODES
-  if (b_mode == B_CONTEXT_PRED)
-    b_mode = x->bmi.as_mode.context;
-#endif
-
-  switch (b_mode) {
-    case B_DC_PRED: {
-      int expected_dc = 0;
-
-      for (i = 0; i < 4; i++) {
-        expected_dc += above[i];
-        expected_dc += left[i];
-      }
-
-      expected_dc = (expected_dc + 4) >> 3;
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = expected_dc;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_TM_PRED: {
-      /* prediction similar to true_motion prediction */
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = clip_pixel(above[c] - top_left + left[r]);
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_VE_PRED: {
-      unsigned int ap[4];
-
-      ap[0] = above[0];
-      ap[1] = above[1];
-      ap[2] = above[2];
-      ap[3] = above[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = ap[c];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_HE_PRED: {
-      unsigned int lp[4];
-
-      lp[0] = left[0];
-      lp[1] = left[1];
-      lp[2] = left[2];
-      lp[3] = left[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = lp[r];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_LD_PRED: {
-      uint8_t *ptr = above;
-
-      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-      predictor[0 * 16 + 1] =
-        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 1] =
-          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[2 * 16 + 1] =
-            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[3 * 16 + 2] =
-        predictor[2 * 16 + 1] =
-          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[1 * 16 + 1] =
-            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
-      predictor[3 * 16 + 2] =
-        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
-      predictor[3 * 16 + 3] =
-        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
-      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED: {
-      uint8_t *pp = above;
-
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[1 * 16 + 1] =
-        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[2 * 16 + 1] =
-        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-
-    case B_HU_PRED: {
-      uint8_t *pp = left;
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[2 * 16 + 3] =
-          predictor[3 * 16 + 0] =
-            predictor[3 * 16 + 1] =
-              predictor[3 * 16 + 2] =
-                predictor[3 * 16 + 3] = pp[3];
-    }
-    break;
-
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-    break;
-    /*
-    case B_CORNER_PRED:
-    corner_predictor(predictor, 16, 4, above, left);
-    break;
-    */
-#endif
-  }
-}
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -5,14 +5,13 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_enums.h"
 
 struct loop_filter_info;
-struct blockd;
 struct macroblockd;
 struct loop_filter_info;
 
 /* Encoder forward decls */
-struct block;
 struct macroblock;
 struct vp9_variance_vtable;
 
@@ -26,33 +25,27 @@
 #
 # Dequant
 #
-prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block_8x8
+prototype void vp9_idct_add_y_block_8x8 "int16_t *q, uint8_t *dst, int stride, struct macroblockd *xd"
+specialize vp9_idct_add_y_block_8x8
 
-prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_8x8
+prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_16x16
+prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add_8x8
 
-prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_8x8
+prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add
 
-prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add
+prototype void vp9_idct_add_y_block "int16_t *q, uint8_t *dst, int stride, struct macroblockd *xd"
+specialize vp9_idct_add_y_block
 
-prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block
+prototype void vp9_idct_add_uv_block "int16_t *q, uint8_t *dst, int stride, uint16_t *eobs"
+specialize vp9_idct_add_uv_block
 
-prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block
+prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
+specialize vp9_idct_add_32x32
 
-prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_32x32
-
-prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_16x16
-
 #
 # RECON
 #
@@ -67,98 +60,26 @@
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_b
+prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
+specialize void vp9_build_intra_predictors
 
-prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_uv_b
+prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
+specialize vp9_build_intra_predictors_sby_s
 
-prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon2b sse2
+prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
+specialize vp9_build_intra_predictors_sbuv_s
 
-prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon4b sse2
-
-prototype void vp9_recon_mb "struct macroblockd *x"
-specialize vp9_recon_mb
-
-prototype void vp9_recon_mby "struct macroblockd *x"
-specialize vp9_recon_mby
-
-prototype void vp9_recon_mby_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_mby_s
-
-prototype void vp9_recon_mbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_mbuv_s
-
-prototype void vp9_recon_sby_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_sby_s
-
-prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_sbuv_s
-
-prototype void vp9_recon_sb64y_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_sb64y_s
-
-prototype void vp9_recon_sb64uv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_sb64uv_s
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s
-
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sby_s;
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sbuv_s;
-
-prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby;
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s;
-
-prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv;
-
-prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv_s;
-
-prototype void vp9_build_intra_predictors_sb64y_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sb64y_s;
-
-prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sb64uv_s;
-
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
+prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
-prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
-specialize vp9_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
-specialize vp9_intra_uv4x4_predict;
-
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
-prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_8x8 sse2
-
-prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
-prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
-prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
-prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_16x16 sse2
 
-prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_32x32 sse2
 fi
 
@@ -165,54 +86,24 @@
 #
 # Loopfilter
 #
-prototype void vp9_loop_filter_mbv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbv sse2
+prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mb_lpf_vertical_edge_w
 
-prototype void vp9_loop_filter_bv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv sse2
+prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mbloop_filter_vertical_edge
 
-prototype void vp9_loop_filter_bv8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv8x8 sse2
+prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_loop_filter_vertical_edge
 
-prototype void vp9_loop_filter_mbh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbh sse2
+prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mb_lpf_horizontal_edge_w
 
-prototype void vp9_loop_filter_bh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh sse2
+prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mbloop_filter_horizontal_edge
 
-prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh8x8 sse2
+prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_loop_filter_horizontal_edge
 
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
-prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_lpf_mbh_w sse2
-
-prototype void vp9_lpf_mbv_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_lpf_mbv_w sse2
-
 #
 # post proc
 #
@@ -225,7 +116,7 @@
 specialize vp9_mbpost_proc_across_ip sse2
 vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
 
-prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
+prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
 specialize vp9_post_proc_down_and_across mmx sse2
 vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 
@@ -244,18 +135,6 @@
 specialize vp9_blend_b
 
 #
-# sad 16x3, 3x16
-#
-prototype unsigned int vp9_sad16x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
-specialize vp9_sad16x3 sse2
-
-prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
-specialize vp9_sad3x16 sse2
-
-prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x2 sse2
-
-#
 # Sub Pixel Filters
 #
 prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
@@ -276,123 +155,64 @@
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert ssse3
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-prototype void vp9_convolve8_1by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8
-
-prototype void vp9_convolve8_qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr
-
-prototype void vp9_convolve8_3by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8
-
-prototype void vp9_convolve8_5by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8
-
-prototype void vp9_convolve8_3qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr
-
-prototype void vp9_convolve8_7by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8
-
-prototype void vp9_convolve8_1by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_horiz
-
-prototype void vp9_convolve8_qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_horiz
-
-prototype void vp9_convolve8_3by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_horiz
-
-prototype void vp9_convolve8_5by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_horiz
-
-prototype void vp9_convolve8_3qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_horiz
-
-prototype void vp9_convolve8_7by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_horiz
-
-prototype void vp9_convolve8_1by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_vert
-
-prototype void vp9_convolve8_qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_vert
-
-prototype void vp9_convolve8_3by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_vert
-
-prototype void vp9_convolve8_5by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_vert
-
-prototype void vp9_convolve8_3qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_vert
-
-prototype void vp9_convolve8_7by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_vert
-#endif
-
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
-prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct8x8 sse2
+prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_add sse2
 
-prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_8x8 sse2
+prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_8x8_add sse2
 
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
 
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
 
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
 
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
 
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
 
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
-
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
 
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht8x8
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
+prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
-
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
 
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
+
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
 
@@ -408,66 +228,148 @@
 # variance
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
 
+prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x16 sse2
+
+prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x32 sse2
+
+prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance64x32 sse2
+
+prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x64 sse2
+
 prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32
+specialize vp9_variance32x32 sse2
 
 prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance64x64
+specialize vp9_variance64x64 sse2
 
 prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x16 mmx sse2
-vp9_variance16x16_sse2=vp9_variance16x16_wmt
-vp9_variance16x16_mmx=vp9_variance16x16_mmx
 
 prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x8 mmx sse2
-vp9_variance16x8_sse2=vp9_variance16x8_wmt
-vp9_variance16x8_mmx=vp9_variance16x8_mmx
 
 prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x16 mmx sse2
-vp9_variance8x16_sse2=vp9_variance8x16_wmt
-vp9_variance8x16_mmx=vp9_variance8x16_mmx
 
 prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x8 mmx sse2
-vp9_variance8x8_sse2=vp9_variance8x8_wmt
-vp9_variance8x8_mmx=vp9_variance8x8_mmx
 
+prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"
+specialize vp9_get_sse_sum_8x8 sse2
+vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2
+
+prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x4 sse2
+
+prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x8 sse2
+
 prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance4x4 mmx sse2
-vp9_variance4x4_sse2=vp9_variance4x4_wmt
-vp9_variance4x4_mmx=vp9_variance4x4_mmx
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
+prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x64
+
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
+prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance64x32
+
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
+prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x16
+
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
+prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x32
+
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
+# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
+prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x4
+
+prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x4
+
+prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x8
+
+prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
 
+prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad32x64 sse2
+
+prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad64x32 sse2
+
+prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad32x16 sse2
+
+prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad16x32 sse2
+
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32 sse2
 
@@ -483,6 +385,13 @@
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
+# TODO(jingning): need to convert these functions into mmx/sse2 form
+prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x4
+
+prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad4x8
+
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
@@ -555,6 +464,12 @@
 prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
+prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
+specialize vp9_sad8x4x8
+
+prototype void vp9_sad4x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
+specialize vp9_sad4x8x8
+
 prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
@@ -561,6 +476,18 @@
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad64x64x4d sse2
 
+prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x64x4d sse2
+
+prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x32x4d sse2
+
+prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x16x4d sse2
+
+prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x32x4d sse2
+
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d sse2
 
@@ -576,6 +503,13 @@
 prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse2
 
+# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x4x4d
+
+prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x8x4d
+
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
@@ -585,6 +519,15 @@
 specialize vp9_mse16x16 mmx sse2
 vp9_mse16x16_sse2=vp9_mse16x16_wmt
 
+prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse8x16
+
+prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse16x8
+
+prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse8x8
+
 prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_mse64x64
 
@@ -594,30 +537,11 @@
 prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
 
 prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
 specialize vp9_block_error mmx sse2
 vp9_block_error_sse2=vp9_block_error_xmm
 
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror mmx sse2
-vp9_mbuverror_sse2=vp9_mbuverror_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype void vp9_subtract_mby "int16_t *diff, uint8_t *src, uint8_t *pred, int stride"
-specialize vp9_subtract_mby mmx sse2
-
-prototype void vp9_subtract_mbuv "int16_t *diff, uint8_t *usrc, uint8_t *vsrc, uint8_t *pred, int stride"
-specialize vp9_subtract_mbuv mmx sse2
-
 #
 # Structured Similarity (SSIM)
 #
@@ -665,16 +589,16 @@
 #
 # Motion search
 #
-prototype int vp9_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n"
 specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
 
-prototype int vp9_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_refining_search_sad sse3
 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
 
-prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_diamond_search_sad sse3
 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
 
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -12,8 +12,8 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_seg_common.h"
 
-static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
-static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 0xf, 0xf };
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 };
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
@@ -20,13 +20,10 @@
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
+int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id,
                           SEG_LVL_FEATURES feature_id) {
-  // Return true if mask bit set and segmentation enabled.
-  return (xd->segmentation_enabled &&
-          (xd->segment_feature_mask[segment_id] &
-           (0x01 << feature_id)));
+  return xd->segmentation_enabled &&
+         (xd->segment_feature_mask[segment_id] & (1 << feature_id));
 }
 
 void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
@@ -34,14 +31,12 @@
   vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
 }
 
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
+void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id,
                            SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
+  xd->segment_feature_mask[segment_id] |= 1 << feature_id;
 }
 
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
+void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
   xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
 }
@@ -51,22 +46,19 @@
 }
 
 int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
-  return segfeaturedata_signed[feature_id];
+  return seg_feature_data_signed[feature_id];
 }
 
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
+void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id,
                        SEG_LVL_FEATURES feature_id) {
   xd->segment_feature_data[segment_id][feature_id] = 0;
 }
 
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data) {
+void vp9_set_segdata(MACROBLOCKD *xd, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
   assert(seg_data <= seg_feature_data_max[feature_id]);
   if (seg_data < 0) {
-    assert(segfeaturedata_signed[feature_id]);
+    assert(seg_feature_data_signed[feature_id]);
     assert(-seg_data <= seg_feature_data_max[feature_id]);
   }
 
@@ -73,33 +65,16 @@
   xd->segment_feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
+int vp9_get_segdata(const MACROBLOCKD *xd, int segment_id,
                     SEG_LVL_FEATURES feature_id) {
   return xd->segment_feature_data[segment_id][feature_id];
 }
 
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
-}
 
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
-    (1 << ref_frame);
-}
+const vp9_tree_index vp9_segment_tree[14] = {
+  2,  4,  6,  8, 10, 12,
+  0, -1, -2, -3, -4, -5, -6, -7
+};
 
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          (1 << ref_frame)) ? 1 : 0;
-}
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          ~(1 << INTRA_FRAME)) ? 1 : 0;
-}
 
 // TBD? Functions to read and write segment data with range / validity checking
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -45,17 +45,7 @@
                     int segment_id,
                     SEG_LVL_FEATURES feature_id);
 
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
+extern const vp9_tree_index vp9_segment_tree[14];
 
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
--- a/vp9/common/vp9_setupintrarecon.c
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  /* set up frame new frame for intra coded blocks */
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = (uint8_t) 129;
-
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->u_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
-
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->v_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
-
-}
--- a/vp9/common/vp9_setupintrarecon.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
-#define VP9_COMMON_VP9_SETUPINTRARECON_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
-
-#endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
--- a/vp9/common/vp9_swapyv12buffer.c
+++ /dev/null
@@ -1,32 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame) {
-  uint8_t *temp;
-
-  temp = last_frame->buffer_alloc;
-  last_frame->buffer_alloc = new_frame->buffer_alloc;
-  new_frame->buffer_alloc = temp;
-
-  temp = last_frame->y_buffer;
-  last_frame->y_buffer = new_frame->y_buffer;
-  new_frame->y_buffer = temp;
-
-  temp = last_frame->u_buffer;
-  last_frame->u_buffer = new_frame->u_buffer;
-  new_frame->u_buffer = temp;
-
-  temp = last_frame->v_buffer;
-  last_frame->v_buffer = new_frame->v_buffer;
-  new_frame->v_buffer = temp;
-}
--- a/vp9/common/vp9_swapyv12buffer.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-#define VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame);
-
-#endif  // VP9_COMMON_VP9_SWAPYV12BUFFER_H_
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -17,27 +17,27 @@
 
 static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
                                  int *max_tile_off, int tile_idx,
-                                 int log2_n_tiles, int n_mbs) {
-  const int n_sbs = (n_mbs + 3) >> 2;
+                                 int log2_n_tiles, int n_mis) {
+  const int n_sbs = (n_mis + 7) >> 3;
   const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;
   const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
 
-  *min_tile_off = MIN(sb_off1 << 2, n_mbs);
-  *max_tile_off = MIN(sb_off2 << 2, n_mbs);
+  *min_tile_off = MIN(sb_off1 << 3, n_mis);
+  *max_tile_off = MIN(sb_off2 << 3, n_mis);
 }
 
 void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
   cm->cur_tile_col_idx = tile_col_idx;
-  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,
-                       &cm->cur_tile_mb_col_end, tile_col_idx,
-                       cm->log2_tile_columns, cm->mb_cols);
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start,
+                       &cm->cur_tile_mi_col_end, tile_col_idx,
+                       cm->log2_tile_columns, cm->mi_cols);
 }
 
 void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
   cm->cur_tile_row_idx = tile_row_idx;
-  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,
-                       &cm->cur_tile_mb_row_end, tile_row_idx,
-                       cm->log2_tile_rows, cm->mb_rows);
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start,
+                       &cm->cur_tile_mi_row_end, tile_row_idx,
+                       cm->log2_tile_rows, cm->mi_rows);
 }
 
 
@@ -49,10 +49,15 @@
   for (max_log2_n_tiles = 0;
        (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;
        max_log2_n_tiles++) {}
+  max_log2_n_tiles--;
+  if (max_log2_n_tiles <  0)
+    max_log2_n_tiles = 0;
+
   for (min_log2_n_tiles = 0;
        (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;
        min_log2_n_tiles++) {}
 
+  assert(max_log2_n_tiles >= min_log2_n_tiles);
   *min_log2_n_tiles_ptr = min_log2_n_tiles;
   *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
 }
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -14,19 +14,13 @@
 #if defined(CONFIG_DEBUG) && CONFIG_DEBUG
 #include <assert.h>
 #endif
-#include <stdio.h>
 
 #include "vp9/common/vp9_treecoder.h"
 
-static void tree2tok(
-  struct vp9_token_struct *const p,
-  vp9_tree t,
-  int i,
-  int v,
-  int L
-) {
+static void tree2tok(struct vp9_token *const p, vp9_tree t,
+                    int i, int v, int l) {
   v += v;
-  ++L;
+  ++l;
 
   do {
     const vp9_tree_index j = t[i++];
@@ -33,17 +27,17 @@
 
     if (j <= 0) {
       p[-j].value = v;
-      p[-j].Len = L;
+      p[-j].len = l;
     } else
-      tree2tok(p, t, j, v, L);
+      tree2tok(p, t, j, v, l);
   } while (++v & 1);
 }
 
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
+void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) {
   tree2tok(p, t, 0, 0, 0);
 }
 
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
+void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
                                  int offset) {
   tree2tok(p - offset, t, 0, 0, 0);
 }
@@ -62,12 +56,12 @@
     left = convert_distribution(tree[i], tree, probs, branch_ct,
                                 num_events, tok0_offset);
   }
-  if (tree[i + 1] <= 0) {
+  if (tree[i + 1] <= 0)
     right = num_events[-tree[i + 1] - tok0_offset];
-  } else {
+  else
     right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
-                                num_events, tok0_offset);
-  }
+                                 num_events, tok0_offset);
+
   probs[i>>1] = get_binary_prob(left, right);
   branch_ct[i>>1][0] = left;
   branch_ct[i>>1][1] = right;
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -13,6 +13,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 
 typedef uint8_t vp9_prob;
 
@@ -31,16 +32,15 @@
 
 typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
 
-typedef const struct vp9_token_struct {
+struct vp9_token {
   int value;
-  int Len;
-} vp9_token;
+  int len;
+};
 
 /* Construct encoding array from tree. */
 
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
-                                 int offset);
+void vp9_tokens_from_tree(struct vp9_token*, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
 
 /* Convert array of token occurrence counts into a table of probabilities
    for the associated binary encoding tree.  Also writes count of branches
@@ -76,7 +76,7 @@
 
 /* this function assumes prob1 and prob2 are already within [1,255] range */
 static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
-  return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8;
+  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
 
 #endif  // VP9_COMMON_VP9_TREECODER_H_
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -278,45 +278,20 @@
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h);
 }
 
 void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
@@ -324,44 +299,20 @@
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
-                                        dst, dst_stride,
-                                        h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+  } else {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h);
 }
 #endif
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  {                                                     \
+      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      *(int *)dest = _mm_cvtsi128_si32(d0); \
+      dest += stride; \
+  }
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
+
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
@@ -403,8 +411,18 @@
   in6 = _mm_subs_epi16(stp1_1, stp1_6); \
   in7 = _mm_subs_epi16(stp1_0, stp2_7);
 
-void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
+
+void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -461,19 +479,17 @@
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
-void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -612,15 +628,14 @@
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
 #define IDCT16x16_1D \
@@ -752,8 +767,7 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -938,31 +952,30 @@
       in14 = _mm_srai_epi16(in14, 6);
       in15 = _mm_srai_epi16(in15, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
 
-      output += 8;
+      dest += 8 - (stride * 16);
     }
   }
 }
 
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+                                     int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1020,6 @@
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-
   // 1-D idct. Load input data.
   in0 = _mm_load_si128((__m128i *)input);
   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,29 +1310,28 @@
     in14 = _mm_srai_epi16(in14, 6);
     in15 = _mm_srai_epi16(in15, 6);
 
-    // Store results
-    _mm_store_si128((__m128i *)output, in0);
-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-    output += 8;
+    RECON_AND_STORE(dest, in0);
+    RECON_AND_STORE(dest, in1);
+    RECON_AND_STORE(dest, in2);
+    RECON_AND_STORE(dest, in3);
+    RECON_AND_STORE(dest, in4);
+    RECON_AND_STORE(dest, in5);
+    RECON_AND_STORE(dest, in6);
+    RECON_AND_STORE(dest, in7);
+    RECON_AND_STORE(dest, in8);
+    RECON_AND_STORE(dest, in9);
+    RECON_AND_STORE(dest, in10);
+    RECON_AND_STORE(dest, in11);
+    RECON_AND_STORE(dest, in12);
+    RECON_AND_STORE(dest, in13);
+    RECON_AND_STORE(dest, in14);
+    RECON_AND_STORE(dest, in15);
+
+    dest += 8 - (stride * 16);
   }
 }
 
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -1832,6 +1843,8 @@
       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
+      const __m128i zero = _mm_setzero_si128();
+
       // 2_D: Calculate the results and store them to destination.
       in0 = _mm_add_epi16(stp1_0, stp1_31);
       in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1946,40 @@
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
 
-      output += 8;
+      dest += 8 - (stride * 32);
     }
   }
 }
--- a/vp9/common/x86/vp9_idct_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   int blk_stride      - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rdx,            arg(1) ; dequant
-        mov         rax,            arg(0) ; qcoeff
-
-        movd        xmm4,           [rax]
-        movd        xmm5,           [rdx]
-
-        pinsrw      xmm4,           [rax+32],   4
-        pinsrw      xmm5,           [rdx],      4
-
-        pmullw      xmm4,           xmm5
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; clear coeffs
-        movd        [rax],          xmm5
-        movd        [rax+32],       xmm5
-;pshufb
-        pshuflw     xmm4,           xmm4,       00000000b
-        pshufhw     xmm4,           xmm4,       00000000b
-
-        mov         rax,            arg(2) ; pre
-        paddw       xmm4,           [GLOBAL(fours)]
-
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-        psraw       xmm4,           3
-
-        movq        xmm0,           [rax]
-        movq        xmm1,           [rax+rcx]
-        movq        xmm2,           [rax+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm3,           [rax+rcx]
-
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-        mov         rax,            arg(3) ; dst
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; store blocks back out
-        movq        [rax],          xmm0
-        movq        [rax + rdx],    xmm1
-
-        lea         rax,            [rax + 2*rdx]
-
-        movq        [rax],          xmm2
-        movq        [rax + rdx],    xmm3
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ;   to spit out sensicle data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   short *dc           - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        mov         rdx,            arg(5) ; dc
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; load up 2 dc words here == 2*16 = doubleword
-        movd        xmm4,           [rdx]
-
-    ; Load up predict blocks
-        movq        xmm0,           [rsi]
-        movq        xmm1,           [rsi+16]
-        movq        xmm2,           [rsi+32]
-        movq        xmm3,           [rsi+48]
-
-    ; Duplicate and expand dc across
-        punpcklwd   xmm4,           xmm4
-        punpckldq   xmm4,           xmm4
-
-    ; Rounding to dequant and downshift
-        paddw       xmm4,           [GLOBAL(fours)]
-        psraw       xmm4,           3
-
-    ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ;   to spit out sensicle data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; DC component
-        mov         rdx,            arg(5)
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; insert DC component
-        pinsrw      xmm0,           [rdx],      0
-        pinsrw      xmm0,           [rdx+2],    4
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+16]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+32]
-        movq        xmm5,           [rsi+48]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-fours:
-    times 8 dw 0x0004
-align 16
-x_s1sqr2:
-    times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 8 dw 0x4E7B
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@
 
 }
 
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@
                                             v_ptr + 4 * uv_stride);
 }
 
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@
                                           v_ptr + 4);
 }
 
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@
     pop         rbp
     ret
 
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi, [rsi + rax*4- 2];  ;
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offseted ; q0
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 11
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0sz add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 11
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
 SECTION_RODATA
 align 16
 tfe:
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@
     pop         rbp
     ret
 
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi - 2 ]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 tfe:
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
 #endif
 
 #if HAVE_SSE2
@@ -34,10 +30,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
 #endif
 
 #endif  // LOOPFILTER_X86_H
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@@ -10,55 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx) PRIVATE
-sym(vp9_recon_b_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov       rsi, arg(0) ;s
-        mov       rdi, arg(2) ;d
-        mov       rdx, arg(1) ;q
-        movsxd    rax, dword ptr arg(3) ;stride
-        pxor      mm0, mm0
-
-        movd      mm1, [rsi]
-        punpcklbw mm1, mm0
-        paddsw    mm1, [rdx]
-        packuswb  mm1,  mm0              ; pack and unpack to saturate
-        movd      [rdi], mm1
-
-        movd      mm2, [rsi+16]
-        punpcklbw mm2, mm0
-        paddsw    mm2, [rdx+32]
-        packuswb  mm2, mm0              ; pack and unpack to saturate
-        movd      [rdi+rax], mm2
-
-        movd      mm3, [rsi+32]
-        punpcklbw mm3, mm0
-        paddsw    mm3, [rdx+64]
-        packuswb  mm3,  mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm3
-
-        add       rdi, rax
-        movd      mm4, [rsi+48]
-        punpcklbw mm4, mm0
-        paddsw    mm4, [rdx+96]
-        packuswb  mm4, mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem8x8_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@@ -10,122 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2) PRIVATE
-sym(vp9_recon2b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movq        xmm1,       MMWORD PTR [rsi]
-        punpcklbw   xmm1,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi],   xmm1
-
-
-        movq        xmm2,       MMWORD PTR [rsi+8]
-        punpcklbw   xmm2,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+16]
-        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax],   xmm2
-
-
-        movq        xmm3,       MMWORD PTR [rsi+16]
-        punpcklbw   xmm3,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+32]
-        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm3
-
-        add         rdi, rax
-        movq        xmm4,       MMWORD PTR [rsi+24]
-        punpcklbw   xmm4,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+48]
-        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2) PRIVATE
-sym(vp9_recon4b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        movdqa      xmm5,       xmm1
-        punpcklbw   xmm1,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        paddsw      xmm5,       XMMWORD PTR [rdx+16]
-        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi],  xmm1
-
-
-        movdqa      xmm2,       XMMWORD PTR [rsi+16]
-        movdqa      xmm6,       xmm2
-        punpcklbw   xmm2,       xmm0
-        punpckhbw   xmm6,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+32]
-        paddsw      xmm6,       XMMWORD PTR [rdx+48]
-        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax],  xmm2
-
-
-        movdqa      xmm3,       XMMWORD PTR [rsi+32]
-        movdqa      xmm7,       xmm3
-        punpcklbw   xmm3,       xmm0
-        punpckhbw   xmm7,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+64]
-        paddsw      xmm7,       XMMWORD PTR [rdx+80]
-        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
-
-        add       rdi, rax
-        movdqa      xmm4,       XMMWORD PTR [rsi+48]
-        movdqa      xmm5,       xmm4
-        punpcklbw   xmm4,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+96]
-        paddsw      xmm5,       XMMWORD PTR [rdx+112]
-        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem16x16_sse2(
 ;    unsigned char *src,
 ;    int src_stride,
--- a/vp9/common/x86/vp9_recon_wrapper_sse2.c
+++ b/vp9/common/x86/vp9_recon_wrapper_sse2.c
@@ -35,7 +35,7 @@
                                             build_intra_pred_mbuv_fn_t ho_fn) {
   int mode = xd->mode_info_context->mbmi.uv_mode;
   build_intra_pred_mbuv_fn_t fn;
-  int src_stride = xd->dst.uv_stride;
+  int src_stride = xd->plane[1].dst.stride;
 
   switch (mode) {
     case  V_PRED:
@@ -68,34 +68,34 @@
       return;
   }
 
-  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
-  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
+  fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride);
+  fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride);
 }
 
 void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_sse2,
                                   vp9_intra_pred_uv_ho_mmx2);
 }
 
 void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_ssse3,
                                   vp9_intra_pred_uv_ho_ssse3);
 }
 
 void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_sse2,
                                   vp9_intra_pred_uv_ho_mmx2);
 }
 
 void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_ssse3,
                                   vp9_intra_pred_uv_ho_ssse3);
 }
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -81,10 +81,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -165,10 +165,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -250,10 +250,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -285,10 +285,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
--- a/vp9/common/x86/vp9_subpixel_variance_sse2.c
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -43,48 +43,3 @@
                                      int  yoffset,
                                      int *sum,
                                      unsigned int *sumsquared);
-
-unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char  *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const unsigned char *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 2,
-      xoffset, yoffset,
-      &xsum1, &xxsum1);
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
-}
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -13,34 +13,32 @@
 
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz) {
-  br->user_buffer_end = source + source_sz;
-  br->user_buffer = source;
-  br->value = 0;
-  br->count = -8;
-  br->range = 255;
+int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
+  int marker_bit;
 
-  if (source_sz && !source)
+  r->buffer_end = buffer + size;
+  r->buffer = buffer;
+  r->value = 0;
+  r->count = -8;
+  r->range = 255;
+
+  if (size && !buffer)
     return 1;
 
-  /* Populate the buffer */
-  vp9_bool_decoder_fill(br);
-
-  return 0;
+  vp9_reader_fill(r);
+  marker_bit = vp9_read_bit(r);
+  return marker_bit != 0;
 }
 
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br) {
-  const unsigned char *bufptr = br->user_buffer;
-  const unsigned char *bufend = br->user_buffer_end;
-  VP9_BD_VALUE value = br->value;
-  int count = br->count;
+void vp9_reader_fill(vp9_reader *r) {
+  const uint8_t *const buffer_end = r->buffer_end;
+  const uint8_t *buffer = r->buffer;
+  VP9_BD_VALUE value = r->value;
+  int count = r->count;
   int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8);
   int loop_end = 0;
-  int bits_left = (int)((bufend - bufptr)*CHAR_BIT);
-  int x = shift + CHAR_BIT - bits_left;
+  const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT);
+  const int x = shift + CHAR_BIT - bits_left;
 
   if (x >= 0) {
     count += VP9_LOTS_OF_BITS;
@@ -50,79 +48,22 @@
   if (x < 0 || bits_left) {
     while (shift >= loop_end) {
       count += CHAR_BIT;
-      value |= (VP9_BD_VALUE)*bufptr++ << shift;
+      value |= (VP9_BD_VALUE)*buffer++ << shift;
       shift -= CHAR_BIT;
     }
   }
 
-  br->user_buffer = bufptr;
-  br->value = value;
-  br->count = count;
+  r->buffer = buffer;
+  r->value = value;
+  r->count = count;
 }
 
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if (num_values <= 1)
-    return 0;
-  num_values--;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
+const uint8_t *vp9_reader_find_end(vp9_reader *r) {
+  // Find the end of the coded buffer
+  while (r->count > CHAR_BIT && r->count < VP9_BD_VALUE_SIZE) {
+    r->count -= CHAR_BIT;
+    r->buffer--;
   }
-  return cat;
+  return r->buffer;
 }
 
-int vp9_inv_recenter_nonneg(int v, int m) {
-  if (v > (m << 1))
-    return v;
-  else if ((v & 1) == 0)
-    return (v >> 1) + m;
-  else
-    return m - ((v + 1) >> 1);
-}
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n) {
-  int v;
-  int l = get_unsigned_bits(n);
-  int m = (1 << l) - n;
-  if (!l) return 0;
-  v = decode_value(br, l - 1);
-  if (v < m)
-    return v;
-  else
-    return (v << 1) - m + decode_value(br, 1);
-}
-
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
-  int i = 0, mk = 0, word;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      word = vp9_decode_uniform(br, num_syms - mk) + mk;
-      break;
-    } else {
-      if (decode_value(br, 1)) {
-        i++;
-        mk += a;
-      } else {
-        word = decode_value(br, b) + mk;
-        break;
-      }
-    }
-  }
-  return word;
-}
-
-int vp9_decode_unsigned_max(BOOL_DECODER *br, int max) {
-  int data = 0, bit = 0, lmax = max;
-
-  while (lmax) {
-    data |= decode_bool(br, 128) << bit++;
-    lmax >>= 1;
-  }
-  if (data > max)
-    return max;
-  return data;
-}
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -21,32 +21,29 @@
 typedef size_t VP9_BD_VALUE;
 
 #define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-#define VP9_LOTS_OF_BITS (0x40000000)
 
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define VP9_LOTS_OF_BITS 0x40000000
+
 typedef struct {
-  const unsigned char *user_buffer_end;
-  const unsigned char *user_buffer;
-  VP9_BD_VALUE         value;
-  int                  count;
-  unsigned int         range;
-} BOOL_DECODER;
+  const uint8_t *buffer_end;
+  const uint8_t *buffer;
+  VP9_BD_VALUE value;
+  int count;
+  unsigned int range;
+} vp9_reader;
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz);
+int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
 
-void vp9_bool_decoder_fill(BOOL_DECODER *br);
+void vp9_reader_fill(vp9_reader *r);
 
-int vp9_decode_uniform(BOOL_DECODER *br, int n);
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
-int vp9_inv_recenter_nonneg(int v, int m);
+const uint8_t *vp9_reader_find_end(vp9_reader *r);
 
-static int decode_bool(BOOL_DECODER *br, int probability) {
+static int vp9_read(vp9_reader *br, int probability) {
   unsigned int bit = 0;
   VP9_BD_VALUE value;
   VP9_BD_VALUE bigsplit;
@@ -55,7 +52,7 @@
   unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
 
   if (br->count < 0)
-    vp9_bool_decoder_fill(br);
+    vp9_reader_fill(br);
 
   value = br->value;
   count = br->count;
@@ -83,18 +80,20 @@
   return bit;
 }
 
-static int decode_value(BOOL_DECODER *br, int bits) {
-  int z = 0;
-  int bit;
+static int vp9_read_bit(vp9_reader *r) {
+  return vp9_read(r, 128);  // vp9_prob_half
+}
 
-  for (bit = bits - 1; bit >= 0; bit--) {
-    z |= decode_bool(br, 0x80) << bit;
-  }
+static int vp9_read_literal(vp9_reader *br, int bits) {
+  int z = 0, bit;
 
+  for (bit = bits - 1; bit >= 0; bit--)
+    z |= vp9_read_bit(br) << bit;
+
   return z;
 }
 
-static int bool_error(BOOL_DECODER *br) {
+static int vp9_reader_has_error(vp9_reader *r) {
   // Check if we have reached the end of the buffer.
   //
   // Variable 'count' stores the number of bits in the 'value' buffer, minus
@@ -109,9 +108,7 @@
   //
   // 1 if we have tried to decode bits after the end of stream was encountered.
   // 0 No error.
-  return br->count > VP9_BD_VALUE_SIZE && br->count < VP9_LOTS_OF_BITS;
+  return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS;
 }
-
-int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
 
 #endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_decodframe.h"
 #include "vp9/common/vp9_mvref_common.h"
 #if CONFIG_DEBUG
 #include <assert.h>
@@ -35,202 +36,138 @@
 extern int dec_debug;
 #endif
 
-static B_PREDICTION_MODE read_bmode(vp9_reader *bc, const vp9_prob *p) {
-  B_PREDICTION_MODE m = treed_read(bc, vp9_bmode_tree, p);
-#if CONFIG_NEWBINTRAMODES
-  if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS)
-    m = B_CONTEXT_PRED;
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-#endif
+static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
+  MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p);
   return m;
 }
 
-static B_PREDICTION_MODE read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE)treed_read(bc, vp9_kf_bmode_tree, p);
+static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
+  return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
 }
 
-static MB_PREDICTION_MODE read_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_ymode_tree, p);
-}
+static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
+                           int mi_row, int mi_col, int segment_id) {
+  const int mi_index = mi_row * cm->mi_cols + mi_col;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+  const int bw = 1 << mi_width_log2(sb_type);
+  const int bh = 1 << mi_height_log2(sb_type);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  int x, y;
 
-static MB_PREDICTION_MODE read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_sb_ymode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_kf_ymode_tree, p);
-}
-
-static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_i8x8_mode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-// This function reads the current macro block's segnent id from the bitstream
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *xd) {
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    const vp9_prob *const p = xd->mb_segment_tree_probs;
-    mi->segment_id = vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2])
-                                       : vp9_read(r, p[1]);
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      const int index = mi_index + (y * cm->mi_cols + x);
+      cm->last_frame_seg_map[index] = segment_id;
+    }
   }
 }
 
-// This function reads the current macro block's segnent id from the bitstream
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid_except(VP9_COMMON *cm,
-                                 vp9_reader *r, MB_MODE_INFO *mi,
-                                 MACROBLOCKD *xd, int mb_row, int mb_col) {
-  const int mb_index = mb_row * cm->mb_cols + mb_col;
-  const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, mb_index);
-  const vp9_prob *const p = xd->mb_segment_tree_probs;
-  const vp9_prob prob = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    mi->segment_id = vp9_read(r, prob)
-        ? 2 + (pred_seg_id  < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2))
-        :     (pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0));
+static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+  TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
+  if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
+    txfm_size += vp9_read(r, tx_probs[1]);
+    if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
+      txfm_size += vp9_read(r, tx_probs[2]);
   }
-}
-
-#if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r, vp9_prob *ref_id_probs) {
-  int ref_index = 0;
-
-  if (vp9_read(r, ref_id_probs[0])) {
-    ref_index++;
-    if (vp9_read(r, ref_id_probs[1])) {
-      ref_index++;
-      if (vp9_read(r, ref_id_probs[2]))
-        ref_index++;
-    }
+  if (bsize >= BLOCK_SIZE_SB32X32) {
+    cm->fc.tx_count_32x32p[context][txfm_size]++;
+  } else if (bsize >= BLOCK_SIZE_MB16X16) {
+    cm->fc.tx_count_16x16p[context][txfm_size]++;
+  } else {
+    cm->fc.tx_count_8x8p[context][txfm_size]++;
   }
-  return ref_index;
+  return txfm_size;
 }
-#endif
 
-extern const int vp9_i8x8_block[4];
-static void kfread_modes(VP9D_COMP *pbi,
-                         MODE_INFO *m,
-                         int mb_row,
-                         int mb_col,
-                         BOOL_DECODER* const bc) {
+
+static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
+                         int mi_row, int mi_col,
+                         vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const int mis = pbi->common.mode_info_stride;
-  int map_index = mb_row * pbi->common.mb_cols + mb_col;
-  MB_PREDICTION_MODE y_mode;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int mis = cm->mode_info_stride;
 
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  // Read the Macroblock segmentation map if it is being updated explicitly
-  // this frame (reset to 0 by default).
+  // Read segmentation map if it is being updated explicitly this frame
   m->mbmi.segment_id = 0;
-  if (pbi->mb.update_mb_segmentation_map) {
-    read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    if (m->mbmi.sb_type) {
-      const int nmbs = 1 << m->mbmi.sb_type;
-      const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-      const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-      int x, y;
-
-      for (y = 0; y < ymbs; y++) {
-        for (x = 0; x < xmbs; x++) {
-          cm->last_frame_seg_map[map_index + x + y * cm->mb_cols] =
-              m->mbmi.segment_id;
-        }
-      }
-    } else {
-      cm->last_frame_seg_map[map_index] = m->mbmi.segment_id;
-    }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    m->mbmi.segment_id = read_mb_segid(r, xd);
+    set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id);
   }
 
-  m->mbmi.mb_skip_coeff = 0;
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))) {
-    m->mbmi.mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, &pbi->mb,
-                                                           PRED_MBSKIP));
-  } else {
-    m->mbmi.mb_skip_coeff = vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id,
-                                                  SEG_LVL_SKIP);
+  m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id,
+                                                SEG_LVL_SKIP);
+  if (!m->mbmi.mb_skip_coeff) {
+    m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+    cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
+                       [m->mbmi.mb_skip_coeff]++;
   }
 
-  y_mode = m->mbmi.sb_type ?
-      read_kf_sb_ymode(bc,
-          pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):
-      read_kf_mb_ymode(bc,
-          pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  if ((m->mbmi.mode = y_mode) == B_PRED) {
-    int i = 0;
-    do {
-      const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
-                                  left_block_mode(m, i) : B_DC_PRED;
-
-      m->bmi[i].as_mode.first = read_kf_bmode(bc,
-                                              pbi->common.kf_bmode_prob[a][l]);
-    } while (++i < 16);
-  }
-
-  if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      const int ib = vp9_i8x8_block[i];
-      const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-    }
-  } else {
-    m->mbmi.uv_mode = read_uv_mode(bc,
-                                   pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-  }
-
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      m->mbmi.mb_skip_coeff == 0 &&
-      m->mbmi.mode <= I8X8_PRED) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
-      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
-      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type)
-        m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
-    }
-  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.sb_type) {
+      m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type);
+  } else if (cm->txfm_mode >= ALLOW_32X32 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) {
     m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_8X8 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
     m->mbmi.txfm_size = TX_8X8;
   } else {
     m->mbmi.txfm_size = TX_4X4;
   }
+
+  // luma mode
+  m->mbmi.ref_frame[0] = INTRA_FRAME;
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                  left_block_mode(m, 0) : DC_PRED;
+    m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+  } else {
+    int idx, idy;
+    int bw = 1 << b_width_log2(m->mbmi.sb_type);
+    int bh = 1 << b_height_log2(m->mbmi.sb_type);
+
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int ib = idy * 2 + idx;
+        int k;
+        const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                      left_block_mode(m, ib) : DC_PRED;
+        m->bmi[ib].as_mode.first =
+            read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+        for (k = 1; k < bh; ++k)
+          m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first;
+        for (k = 1; k < bw; ++k)
+          m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first;
+      }
+    }
+    m->mbmi.mode = m->bmi[3].as_mode.first;
+  }
+
+  m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
 }
 
-static int read_nmv_component(vp9_reader *r,
-                              int rv,
-                              const nmv_component *mvcomp) {
-  int mag, d;
+static int read_mv_component(vp9_reader *r,
+                             const nmv_component *mvcomp, int usehp) {
+
+  int mag, d, fr, hp;
   const int sign = vp9_read(r, mvcomp->sign);
   const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
 
+  // Integer part
   if (mv_class == MV_CLASS_0) {
     d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
   } else {
     int i;
-    int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
 
     d = 0;
     for (i = 0; i < n; ++i)
@@ -237,235 +174,125 @@
       d |= vp9_read(r, mvcomp->bits[i]) << i;
   }
 
-  mag = vp9_get_mv_mag(mv_class, d << 3);
-  return sign ? -(mag + 8) : (mag + 8);
-}
+  // Fractional part
+  fr = treed_read(r, vp9_mv_fp_tree,
+                  mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp);
 
-static int read_nmv_component_fp(vp9_reader *r,
-                                 int v,
-                                 int rv,
-                                 const nmv_component *mvcomp,
-                                 int usehp) {
-  const int sign = v < 0;
-  int mag = ((sign ? -v : v) - 1) & ~7;  // magnitude - 1
-  int offset;
-  const int mv_class = vp9_get_mv_class(mag, &offset);
-  const int f = mv_class == MV_CLASS_0 ?
-      treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[offset >> 3]):
-      treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
 
-  offset += f << 1;
+  // High precision part (if hp is not used, the default value of the hp is 1)
+  hp = usehp ? vp9_read(r,
+                        mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp)
+             : 1;
 
-  if (usehp) {
-    const vp9_prob p = mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp;
-    offset += vp9_read(r, p);
-  } else {
-    offset += 1;  // If hp is not used, the default value of the hp bit is 1
-  }
-  mag = vp9_get_mv_mag(mv_class, offset);
-  return sign ? -(mag + 1) : (mag + 1);
+  // result
+  mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
+  return sign ? -mag : mag;
 }
 
-static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
-                     const nmv_context *mvctx) {
-  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
-  mv->row = mv-> col = 0;
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
-  }
-
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
-  }
-}
-
-static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
-                        const nmv_context *mvctx, int usehp) {
-  const MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
-                                    usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
-                                    usehp);
-  }
-  /*
-  printf("MV: %d %d REF: %d %d\n", mv->row + ref->row, mv->col + ref->col,
-	 ref->row, ref->col);
-	 */
-}
-
-static void update_nmv(vp9_reader *bc, vp9_prob *const p,
+static void update_nmv(vp9_reader *r, vp9_prob *const p,
                        const vp9_prob upd_p) {
-  if (vp9_read(bc, upd_p)) {
+  if (vp9_read(r, upd_p)) {
 #ifdef LOW_PRECISION_MV_UPDATE
-    *p = (vp9_read_literal(bc, 7) << 1) | 1;
+    *p = (vp9_read_literal(r, 7) << 1) | 1;
 #else
-    *p = (vp9_read_literal(bc, 8));
+    *p = (vp9_read_literal(r, 8));
 #endif
   }
 }
 
-static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
+static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
                           int usehp) {
   int i, j, k;
 
 #ifdef MV_GROUP_UPDATE
-  if (!vp9_read_bit(bc))
+  if (!vp9_read_bit(r))
     return;
 #endif
   for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_nmv(bc, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+    update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
 
   for (i = 0; i < 2; ++i) {
-    update_nmv(bc, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+    update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
     for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_nmv(bc, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_nmv(bc, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_nmv(bc, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
       for (k = 0; k < 3; ++k)
-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
-    }
+        update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < 3; ++j)
-      update_nmv(bc, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_nmv(bc, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
     }
   }
 }
 
 // Read the referncence frame
-static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
-                                         vp9_reader *const bc,
-                                         unsigned char segment_id) {
-  MV_REFERENCE_FRAME ref_frame;
+static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r,
+                           int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  const int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                                   SEG_LVL_REF_FRAME);
 
-  int seg_ref_count = 0;
-  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+  // Segment reference frame features not available.
+  if (!seg_ref_active) {
+    int is_comp;
+    int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER);
 
-  // If segment coding enabled does the segment allow for more than one
-  // possible reference frame
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]);
+      cm->fc.comp_inter_count[comp_ctx][is_comp]++;
+    } else {
+      is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
 
-  // Segment reference frame features not available or allows for
-  // multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    MV_REFERENCE_FRAME pred_ref;
+    // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
+    if (is_comp) {
+      int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P);
 
-    // Get the context probability the prediction flag
-    vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Read the prediction status flag
-    unsigned char prediction_flag = vp9_read(bc, pred_prob);
-
-    // Store the prediction flag.
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-
-    // Get the predicted reference frame.
-    pred_ref = vp9_get_pred_ref(cm, xd);
-
-    // If correctly predicted then use the predicted value
-    if (prediction_flag) {
-      ref_frame = pred_ref;
+      ref_frame[fix_ref_idx]  = cm->comp_fixed_ref;
+      b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]);
+      cm->fc.comp_ref_count[ref_ctx][b]++;
+      ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
     } else {
-      // decode the explicitly coded value
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
-
-      // If segment coding enabled blank out options that cant occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1);
+      ref_frame[1] = NONE;
+      if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) {
+        int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2);
+        int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]);
+        ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME;
+        cm->fc.single_ref_count[ref1_ctx][0][1]++;
+        cm->fc.single_ref_count[ref2_ctx][1][b2]++;
+      } else {
+        ref_frame[0] = LAST_FRAME;
+        cm->fc.single_ref_count[ref1_ctx][0][0]++;
       }
-
-      // Default to INTRA_FRAME (value 0)
-      ref_frame = INTRA_FRAME;
-
-      // Do we need to decode the Intra/Inter branch
-      if (mod_refprobs[0])
-        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
-      else
-        ref_frame++;
-
-      if (ref_frame) {
-        // Do we need to decode the Last/Gf_Arf branch
-        if (mod_refprobs[1])
-          ref_frame += vp9_read(bc, mod_refprobs[1]);
-        else
-          ref_frame++;
-
-        if (ref_frame > 1) {
-          // Do we need to decode the GF/Arf branch
-          if (mod_refprobs[2])
-            ref_frame += vp9_read(bc, mod_refprobs[2]);
-          else {
-            if (seg_ref_active) {
-              if ((pred_ref == GOLDEN_FRAME) ||
-                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
-                ref_frame = ALTREF_FRAME;
-              } else
-                ref_frame = GOLDEN_FRAME;
-            } else
-              ref_frame = (pred_ref == GOLDEN_FRAME)
-                          ? ALTREF_FRAME : GOLDEN_FRAME;
-          }
-        }
-      }
     }
   } else {
-    // Segment reference frame features are enabled
-    // The reference frame for the mb is considered as correclty predicted
-    // if it is signaled at the segment level for the purposes of the
-    // common prediction model
-    vp9_set_pred_flag(xd, PRED_REF, 1);
-    ref_frame = vp9_get_pred_ref(cm, xd);
+    ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME);
+    ref_frame[1] = NONE;
   }
-
-  return (MV_REFERENCE_FRAME)ref_frame;
 }
 
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
+static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p);
 }
 
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
-}
-
-static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
-}
-
 #ifdef VPX_MODE_COUNT
 unsigned int vp9_mv_cont_count[5][4] = {
   { 0, 0, 0, 0 },
@@ -476,79 +303,103 @@
 };
 #endif
 
-static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 };
-static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 },
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15 },
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15 },
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 }
-};
-
-static void read_switchable_interp_probs(VP9D_COMP* const pbi,
-                                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
+static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) {
   int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc);
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+        cm->fc.switchable_interp_prob[j][i] =
+            // vp9_read_prob(r);
+            vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]);
+      }
     }
-  }
-  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
-  //cm->fc.switchable_interp_prob[1]);
 }
 
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
+static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) {
+  int i, j;
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+    for (j = 0; j < VP9_INTER_MODES - 1; ++j) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+        // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r);
+        cm->fc.inter_mode_probs[i][j] =
+            vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]);
+      }
+    }
+}
+
+static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
+  COMPPREDMODE_TYPE mode = vp9_read_bit(r);
+  if (mode)
+     mode += vp9_read_bit(r);
+  return mode;
+}
+
+static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  MACROBLOCKD *const xd  = &pbi->mb;
 
-  if (cm->frame_type == KEY_FRAME) {
-    if (!cm->kf_ymode_probs_update)
-      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
-  } else {
-    if (cm->mcomp_filter_type == SWITCHABLE)
-      read_switchable_interp_probs(pbi, bc);
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cm->use_interintra) {
-      if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB))
-        cm->fc.interintra_prob = vp9_read_prob(bc);
-    }
-#endif
-    // Decode the baseline probabilities for decoding reference frame
-    cm->prob_intra_coded = vp9_read_prob(bc);
-    cm->prob_last_coded  = vp9_read_prob(bc);
-    cm->prob_gf_coded    = vp9_read_prob(bc);
+  if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) {
+    nmv_context *const nmvc = &pbi->common.fc.nmvc;
+    MACROBLOCKD *const xd = &pbi->mb;
+    int i, j;
 
-    // Computes a modified set of probabilities for use when reference
-    // frame prediction fails.
-    vp9_compute_mod_refprobs(cm);
+    read_inter_mode_probs(cm, r);
 
-    pbi->common.comp_pred_mode = vp9_read(bc, 128);
-    if (cm->comp_pred_mode)
-      cm->comp_pred_mode += vp9_read(bc, 128);
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      int i;
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-        cm->prob_comppred[i] = vp9_read_prob(bc);
+    if (cm->mcomp_filter_type == SWITCHABLE)
+      read_switchable_interp_probs(cm, r);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        cm->fc.intra_inter_prob[i] =
+            vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
     }
 
-    if (vp9_read_bit(bc)) {
-      int i = 0;
-
-      do {
-        cm->fc.ymode_prob[i] = vp9_read_prob(bc);
-      } while (++i < VP9_YMODES - 1);
+    if (cm->allow_comp_inter_inter) {
+      cm->comp_pred_mode = read_comp_pred_mode(r);
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            cm->fc.comp_inter_prob[i] =
+                vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]);
+    } else {
+      cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
     }
 
-    if (vp9_read_bit(bc)) {
-      int i = 0;
+    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.single_ref_prob[i][0] =
+              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]);
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.single_ref_prob[i][1] =
+              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]);
+      }
 
-      do {
-        cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc);
-      } while (++i < VP9_I32X32_MODES - 1);
+    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+      for (i = 0; i < REF_CONTEXTS; i++)
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.comp_ref_prob[i] =
+              vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]);
+
+    // VP9_INTRA_MODES
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
+      for (i = 0; i < VP9_INTRA_MODES - 1; ++i) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+          cm->fc.y_mode_prob[j][i] =
+              vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]);
+        }
+      }
     }
+    for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) {
+      for (i = 0; i < PARTITION_TYPES - 1; ++i) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+          cm->fc.partition_prob[INTER_FRAME][j][i] =
+              vp9_read_prob_diff_update(r,
+                  cm->fc.partition_prob[INTER_FRAME][j][i]);
+        }
+      }
+    }
 
-    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
+    read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
   }
 }
 
@@ -555,80 +406,40 @@
 // This function either reads the segment id for the current macroblock from
 // the bitstream or if the value is temporally predicted asserts the predicted
 // value
-static void read_mb_segment_id(VP9D_COMP *pbi,
-                               int mb_row, int mb_col,
-                               BOOL_DECODER* const bc) {
+static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+                              vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int mb_index = mb_row * pbi->common.mb_cols + mb_col;
+  MODE_INFO *const mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      // Is temporal coding of the segment id for this mb enabled.
-      if (cm->temporal_update) {
-        // Get the context based probability for reading the
-        // prediction status flag
-        vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+  if (!xd->segmentation_enabled)
+    return 0;  // Default for disabled segmentation
 
-        // Read the prediction status flag
-        unsigned char seg_pred_flag = vp9_read(bc, pred_prob);
+  if (xd->update_mb_segmentation_map) {
+    int segment_id;
 
-        // Store the prediction flag.
-        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
+    if (cm->temporal_update) {
+      // Temporal coding of the segment id for this mb is enabled.
+      // Get the context based probability for reading the
+      // prediction status flag
+      const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+      const int pred_flag = vp9_read(r, pred_prob);
+      vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag);
 
-        // If the value is flagged as correctly predicted
-        // then use the predicted value
-        if (seg_pred_flag) {
-          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, mb_index);
-        } else {
-          // Decode it explicitly
-          read_mb_segid_except(cm, bc, mbmi, xd, mb_row, mb_col);
-        }
-      } else {
-        // Normal unpredicted coding mode
-        read_mb_segid(bc, mbmi, xd);
-      }
-
-      if (mbmi->sb_type) {
-        const int nmbs = 1 << mbmi->sb_type;
-        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-        int x, y;
-
-        for (y = 0; y < ymbs; y++) {
-          for (x = 0; x < xmbs; x++) {
-            cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols] =
-                mbmi->segment_id;
-          }
-        }
-      } else {
-        cm->last_frame_seg_map[mb_index] = mbmi->segment_id;
-      }
+      // If the value is flagged as correctly predicted
+      // then use the predicted value, otherwise decode it explicitly
+      segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type,
+                                                     mi_row, mi_col)
+                             : read_mb_segid(r, xd);
     } else {
-      if (mbmi->sb_type) {
-        const int nmbs = 1 << mbmi->sb_type;
-        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-        unsigned segment_id = -1;
-        int x, y;
-
-        for (y = 0; y < ymbs; y++) {
-          for (x = 0; x < xmbs; x++) {
-            segment_id = MIN(segment_id,
-                cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols]);
-          }
-        }
-        mbmi->segment_id = segment_id;
-      } else {
-        mbmi->segment_id = cm->last_frame_seg_map[mb_index];
-      }
+      segment_id = read_mb_segid(r, xd);  // Normal unpredicted coding mode
     }
+
+    set_segment_id(cm, mbmi, mi_row, mi_col, segment_id);  // Side effect
+    return segment_id;
   } else {
-    // The encoder explicitly sets the segment_id to 0
-    // when segmentation is disabled
-    mbmi->segment_id = 0;
+    return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col);
   }
 }
 
@@ -643,48 +454,66 @@
            mb_to_bottom_edge);
 }
 
-static INLINE void process_mv(BOOL_DECODER* bc, MV *mv, MV *ref,
-                              nmv_context *nmvc, nmv_context_counts *mvctx,
-                              int usehp) {
-  read_nmv(bc, mv, ref, nmvc);
-  read_nmv_fp(bc, mv, ref, nmvc, usehp);
-  vp9_increment_nmv(mv, ref, mvctx, usehp);
-  mv->row += ref->row;
-  mv->col += ref->col;
+static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
+                             const nmv_context *ctx,
+                             nmv_context_counts *counts,
+                             int usehp) {
+  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+  MV diff = {0, 0};
+
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (mv_joint_vertical(j))
+    diff.row = read_mv_component(r, &ctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    diff.col = read_mv_component(r, &ctx->comps[1], usehp);
+
+  vp9_increment_nmv(&diff, ref, counts, usehp);
+
+  mv->row = diff.row + ref->row;
+  mv->col = diff.col + ref->col;
 }
 
+static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
+    VP9D_COMP *pbi, vp9_reader *r) {
+  const int index = treed_read(r, vp9_switchable_interp_tree,
+                               vp9_get_pred_probs(&pbi->common, &pbi->mb,
+                                                  PRED_SWITCHABLE_INTERP));
+  ++pbi->common.fc.switchable_interp_count
+                [vp9_get_pred_context(
+                    &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index];
+  return vp9_switchable_interp[index];
+}
+
 static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
-                             MODE_INFO *prev_mi,
-                             int mb_row, int mb_col,
-                             BOOL_DECODER* const bc) {
+                             int mi_row, int mi_col,
+                             vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  const int mis = pbi->common.mode_info_stride;
+  nmv_context *const nmvc = &cm->fc.nmvc;
   MACROBLOCKD *const xd = &pbi->mb;
 
-  int_mv *const mv = &mbmi->mv[0];
-  const int mb_size = 1 << mi->mbmi.sb_type;
+  int_mv *const mv0 = &mbmi->mv[0];
+  int_mv *const mv1 = &mbmi->mv[1];
+  BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
 
-  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
-                                       cm->height == cm->last_height &&
-                                       !cm->error_resilient_mode;
-
   int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
+  int j, idx, idy;
 
   mbmi->need_to_clamp_mvs = 0;
   mbmi->need_to_clamp_secondmv = 0;
-  mbmi->second_ref_frame = NONE;
+  mbmi->ref_frame[1] = NONE;
 
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
   // correct entry for the current macroblock.
   xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
 
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
   // that are in 1/8th pel units
-  set_mb_row(cm, xd, mb_row, mb_size);
-  set_mb_col(cm, xd, mb_col, mb_size);
+  set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize),
+                         mi_col, 1 << mi_width_log2(bsize));
 
   mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
   mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
@@ -692,81 +521,78 @@
   mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
 
   // Read the macroblock segment id.
-  read_mb_segment_id(pbi, mb_row, mb_col, bc);
+  mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r);
 
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP))) {
-    // Read the macroblock coeff skip flag if this feature is in use,
-    // else default to 0
-    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
-                                                SEG_LVL_SKIP);
+  mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
+                                              SEG_LVL_SKIP);
+  if (!mbmi->mb_skip_coeff) {
+    mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+    cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
+                       [mbmi->mb_skip_coeff]++;
   }
 
   // Read the reference frame
-  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+  if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) {
+    mbmi->ref_frame[0] =
+        vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
+    cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
+                            [mbmi->ref_frame[0] != INTRA_FRAME]++;
+  } else {
+    mbmi->ref_frame[0] =
+        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+  }
 
-  /*
-  if (pbi->common.current_video_frame == 1)
-    printf("ref frame: %d [%d %d]\n", mbmi->ref_frame, mb_row, mb_col);
-    */
+  if (cm->txfm_mode == TX_MODE_SELECT &&
+      (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) &&
+      bsize >= BLOCK_SIZE_SB8X8) {
+    mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize);
+  } else if (bsize >= BLOCK_SIZE_SB32X32 &&
+             cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+             bsize >= BLOCK_SIZE_MB16X16) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
 
   // If reference frame is an Inter frame
-  if (mbmi->ref_frame) {
+  if (mbmi->ref_frame[0] != INTRA_FRAME) {
     int_mv nearest, nearby, best_mv;
     int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p[VP9_MVREFS - 1];
+    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
 
-    MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame);
 
     {
-      const int use_prev_in_find_best_ref =
-          xd->scale_factor[0].x_num == xd->scale_factor[0].x_den &&
-          xd->scale_factor[0].y_num == xd->scale_factor[0].y_den &&
-          !cm->error_resilient_mode &&
-          !cm->frame_parallel_decoding_mode;
-
-      /* Select the appropriate reference frame for this MB */
-      const int ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
-
-      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],
-          mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
 #ifdef DEC_DEBUG
       if (dec_debug)
         printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
                xd->mode_info_context->mbmi.mv[0].as_mv.col);
 #endif
-      // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5)
-      //  printf("Dello\n");
-      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
-                       ref_frame, mbmi->ref_mvs[ref_frame],
+      vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+                       mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]],
                        cm->ref_frame_sign_bias);
 
-      vp9_mv_ref_probs(&pbi->common, mv_ref_p,
-                       mbmi->mb_mode_context[ref_frame]);
+      vp9_mv_ref_probs(cm, mv_ref_p, mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
       // If the segment level skip mode enabled
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
-      } else {
-        mbmi->mode = mbmi->sb_type ? read_sb_mv_ref(bc, mv_ref_p)
-                                   : read_mv_ref(bc, mv_ref_p);
-        vp9_accum_mv_refs(&pbi->common, mbmi->mode,
-                          mbmi->mb_mode_context[ref_frame]);
+      } else if (bsize >= BLOCK_SIZE_SB8X8) {
+        mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
+        vp9_accum_mv_refs(cm, mbmi->mode,
+                          mbmi->mb_mode_context[mbmi->ref_frame[0]]);
       }
 
-      if (mbmi->mode != ZEROMV) {
+      if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
         vp9_find_best_ref_mvs(xd,
-                              use_prev_in_find_best_ref ?
-                                  xd->pre.y_buffer : NULL,
-                              xd->pre.y_stride,
-                              mbmi->ref_mvs[ref_frame],
+                              mbmi->ref_mvs[mbmi->ref_frame[0]],
                               &nearest, &nearby);
 
-        best_mv.as_int = (mbmi->ref_mvs[ref_frame][0]).as_int;
+        best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int;
       }
 
 #ifdef DEC_DEBUG
@@ -777,176 +603,79 @@
 #endif
     }
 
-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
-      if (cm->mcomp_filter_type == SWITCHABLE) {
-        mbmi->interp_filter = vp9_switchable_interp[
-            treed_read(bc, vp9_switchable_interp_tree,
-                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
-      } else {
-        mbmi->interp_filter = cm->mcomp_filter_type;
-      }
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+                              ? read_switchable_filter_type(pbi, r)
+                              : cm->mcomp_filter_type;
 
-    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
-        (cm->comp_pred_mode == HYBRID_PREDICTION &&
-         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
-      /* Since we have 3 reference frames, we can only have 3 unique
-       * combinations of combinations of 2 different reference frames
-       * (A-G, G-L or A-L). In the bitstream, we use this to simply
-       * derive the second reference frame from the first reference
-       * frame, by saying it's the next one in the enumerator, and
-       * if that's > n_refs, then the second reference frame is the
-       * first one in the enumerator. */
-      mbmi->second_ref_frame = mbmi->ref_frame + 1;
-      if (mbmi->second_ref_frame == 4)
-        mbmi->second_ref_frame = 1;
-      if (mbmi->second_ref_frame > 0) {
-        int second_ref_fb_idx;
-        int use_prev_in_find_best_ref;
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+                       mbmi->ref_frame[1],
+                       mbmi->ref_mvs[mbmi->ref_frame[1]],
+                       cm->ref_frame_sign_bias);
 
-        xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
-        use_prev_in_find_best_ref =
-            xd->scale_factor[1].x_num == xd->scale_factor[1].x_den &&
-            xd->scale_factor[1].y_num == xd->scale_factor[1].y_den &&
-            !cm->error_resilient_mode &&
-            !cm->frame_parallel_decoding_mode;
-
-        /* Select the appropriate reference frame for this MB */
-        second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
-
-        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
-             mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-
-        vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
-                         mbmi->second_ref_frame,
-                         mbmi->ref_mvs[mbmi->second_ref_frame],
-                         cm->ref_frame_sign_bias);
-
-        if (mbmi->mode != ZEROMV) {
-          vp9_find_best_ref_mvs(xd,
-                                use_prev_in_find_best_ref ?
-                                    xd->second_pre.y_buffer : NULL,
-                                xd->second_pre.y_stride,
-                                mbmi->ref_mvs[mbmi->second_ref_frame],
-                                &nearest_second,
-                                &nearby_second);
-          best_mv_second = mbmi->ref_mvs[mbmi->second_ref_frame][0];
-        }
+      if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
+        vp9_find_best_ref_mvs(xd,
+                              mbmi->ref_mvs[mbmi->ref_frame[1]],
+                              &nearest_second,
+                              &nearby_second);
+        best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int;
       }
-
-    } else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (pbi->common.use_interintra &&
-          mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-          mbmi->second_ref_frame == NONE) {
-        mbmi->second_ref_frame = (vp9_read(bc, pbi->common.fc.interintra_prob) ?
-                                  INTRA_FRAME : NONE);
-        // printf("-- %d (%d)\n", mbmi->second_ref_frame == INTRA_FRAME,
-        //        pbi->common.fc.interintra_prob);
-        pbi->common.fc.interintra_counts[
-            mbmi->second_ref_frame == INTRA_FRAME]++;
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          mbmi->interintra_mode = read_ymode(bc, pbi->common.fc.ymode_prob);
-          pbi->common.fc.ymode_counts[mbmi->interintra_mode]++;
-#if SEPARATE_INTERINTRA_UV
-          mbmi->interintra_uv_mode = read_uv_mode(bc,
-              pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);
-          pbi->common.fc.uv_mode_counts[mbmi->interintra_mode]
-                                       [mbmi->interintra_uv_mode]++;
-#else
-          mbmi->interintra_uv_mode = mbmi->interintra_mode;
-#endif
-          // printf("** %d %d\n",
-          //        mbmi->interintra_mode, mbmi->interintra_uv_mode);
-        }
-      }
-#endif
     }
 
-#if CONFIG_NEW_MVREF
-    // if ((mbmi->mode == NEWMV) || (mbmi->mode == SPLITMV))
-    if (mbmi->mode == NEWMV) {
-      int best_index;
-      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-      // Encode the index of the choice.
-      best_index =
-        vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
-
-      best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-
-      if (mbmi->second_ref_frame > 0) {
-        ref_frame = mbmi->second_ref_frame;
-
-        // Encode the index of the choice.
-        best_index =
-          vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
-        best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-      }
-    }
-#endif
-
     mbmi->uv_mode = DC_PRED;
-    switch (mbmi->mode) {
-      case SPLITMV: {
-        const int s = treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count[s];
-        int j = 0;
-
-        cm->fc.mbsplit_counts[s]++;
-        mbmi->need_to_clamp_mvs = 0;
-        mbmi->partitioning = s;
-        do {  // for each subset j
-          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+    if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+      mbmi->need_to_clamp_mvs = 0;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
           int_mv blockmv, secondmv;
-          int mv_contz;
           int blockmode;
-          int k = vp9_mbsplit_offset[s][j];  // first block in subset j
+          int i;
+          j = idy * 2 + idx;
 
-          leftmv.as_int = left_block_mv(xd, mi, k);
-          abovemv.as_int = above_block_mv(mi, k, mis);
-          second_leftmv.as_int = 0;
-          second_abovemv.as_int = 0;
-          if (mbmi->second_ref_frame > 0) {
-            second_leftmv.as_int = left_block_second_mv(xd, mi, k);
-            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+          blockmode = read_sb_mv_ref(r, mv_ref_p);
+          vp9_accum_mv_refs(cm, blockmode,
+                            mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+          if (blockmode == NEARESTMV || blockmode == NEARMV) {
+            MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
+            vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0);
+            if (rf2 > 0) {
+              vp9_append_sub8x8_mvs_for_idx(cm, xd,  &nearest_second,
+                                            &nearby_second, j, 1);
+            }
           }
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
-          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
 
           switch (blockmode) {
-            case NEW4X4:
-              process_mv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+            case NEWMV:
+              decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
                          &cm->fc.NMVcount, xd->allow_high_precision_mv);
 
-              if (mbmi->second_ref_frame > 0)
-                process_mv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                           &cm->fc.NMVcount, xd->allow_high_precision_mv);
+              if (mbmi->ref_frame[1] > 0)
+                decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+                          &cm->fc.NMVcount, xd->allow_high_precision_mv);
 
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][3]++;
 #endif
               break;
-            case LEFT4X4:
-              blockmv.as_int = leftmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_leftmv.as_int;
+            case NEARESTMV:
+              blockmv.as_int = nearest.as_int;
+              if (mbmi->ref_frame[1] > 0)
+                secondmv.as_int = nearest_second.as_int;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][0]++;
 #endif
               break;
-            case ABOVE4X4:
-              blockmv.as_int = abovemv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_abovemv.as_int;
+            case NEARMV:
+              blockmv.as_int = nearby.as_int;
+              if (mbmi->ref_frame[1] > 0)
+                secondmv.as_int = nearby_second.as_int;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][1]++;
 #endif
               break;
-            case ZERO4X4:
+            case ZEROMV:
               blockmv.as_int = 0;
-              if (mbmi->second_ref_frame > 0)
+              if (mbmi->ref_frame[1] > 0)
                 secondmv.as_int = 0;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][2]++;
@@ -955,490 +684,154 @@
             default:
               break;
           }
+          mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+          if (mbmi->ref_frame[1] > 0)
+            mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
 
-          /*  Commenting this section out, not sure why this was needed, and
-           *  there are mismatches with this section in rare cases since it is
-           *  not done in the encoder at all.
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame > 0) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-          */
-
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            unsigned int fill_count = mbsplit_fill_count[s];
-            const unsigned char *fill_offset =
-                &mbsplit_fill_offset[s][j * fill_count];
-
-            do {
-              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-
-        } while (++j < num_p);
+          for (i = 1; i < bh; ++i)
+            vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j]));
+          for (i = 1; i < bw; ++i)
+            vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j]));
+          mi->mbmi.mode = blockmode;
+        }
       }
 
-      mv->as_int = mi->bmi[15].as_mv[0].as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int;
-
-      break;  /* done with SPLITMV */
-
-      case NEARMV:
-        // Clip "next_nearest" so that it does not extend to far out of image
-        assign_and_clamp_mv(mv, &nearby, mb_to_left_edge,
-                                         mb_to_right_edge,
-                                         mb_to_top_edge,
-                                         mb_to_bottom_edge);
-        if (mbmi->second_ref_frame > 0)
-          assign_and_clamp_mv(&mbmi->mv[1], &nearby_second, mb_to_left_edge,
-                                                            mb_to_right_edge,
-                                                            mb_to_top_edge,
-                                                            mb_to_bottom_edge);
-        break;
+      mv0->as_int = mi->bmi[3].as_mv[0].as_int;
+      mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+    } else {
+      switch (mbmi->mode) {
+        case NEARMV:
+          // Clip "next_nearest" so that it does not extend too far out of image
+          assign_and_clamp_mv(mv0, &nearby, mb_to_left_edge,
+                                            mb_to_right_edge,
+                                            mb_to_top_edge,
+                                            mb_to_bottom_edge);
+          if (mbmi->ref_frame[1] > 0)
+            assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge,
+                                                     mb_to_right_edge,
+                                                     mb_to_top_edge,
+                                                     mb_to_bottom_edge);
+          break;
 
-      case NEARESTMV:
-        // Clip "next_nearest" so that it does not extend to far out of image
-        assign_and_clamp_mv(mv, &nearest, mb_to_left_edge,
-                                          mb_to_right_edge,
-                                          mb_to_top_edge,
-                                          mb_to_bottom_edge);
-        if (mbmi->second_ref_frame > 0)
-          assign_and_clamp_mv(&mbmi->mv[1], &nearest_second, mb_to_left_edge,
-                                                             mb_to_right_edge,
-                                                             mb_to_top_edge,
-                                                             mb_to_bottom_edge);
-        break;
+        case NEARESTMV:
+          // Clip "next_nearest" so that it does not extend too far out of image
+          assign_and_clamp_mv(mv0, &nearest, mb_to_left_edge,
+                                             mb_to_right_edge,
+                                             mb_to_top_edge,
+                                             mb_to_bottom_edge);
+          if (mbmi->ref_frame[1] > 0)
+            assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge,
+                                                      mb_to_right_edge,
+                                                      mb_to_top_edge,
+                                                      mb_to_bottom_edge);
+          break;
 
-      case ZEROMV:
-        mv->as_int = 0;
-        if (mbmi->second_ref_frame > 0)
-          mbmi->mv[1].as_int = 0;
-        break;
+        case ZEROMV:
+          mv0->as_int = 0;
+          if (mbmi->ref_frame[1] > 0)
+            mv1->as_int = 0;
+          break;
 
-      case NEWMV:
-        process_mv(bc, &mv->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
-                   xd->allow_high_precision_mv);
+        case NEWMV:
+          decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
+                    xd->allow_high_precision_mv);
+          mbmi->need_to_clamp_mvs = check_mv_bounds(mv0,
+                                                    mb_to_left_edge,
+                                                    mb_to_right_edge,
+                                                    mb_to_top_edge,
+                                                    mb_to_bottom_edge);
 
-        // Don't need to check this on NEARMV and NEARESTMV modes
-        // since those modes clamp the MV. The NEWMV mode does not,
-        // so signal to the prediction stage whether special
-        // handling may be required.
-        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
-                                                  mb_to_left_edge,
-                                                  mb_to_right_edge,
-                                                  mb_to_top_edge,
-                                                  mb_to_bottom_edge);
-
-        if (mbmi->second_ref_frame > 0) {
-          process_mv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
-                     &cm->fc.NMVcount, xd->allow_high_precision_mv);
-          mbmi->need_to_clamp_secondmv |= check_mv_bounds(&mbmi->mv[1],
-                                                          mb_to_left_edge,
-                                                          mb_to_right_edge,
-                                                          mb_to_top_edge,
-                                                          mb_to_bottom_edge);
-        }
-        break;
-      default:
-;
+          if (mbmi->ref_frame[1] > 0) {
+            decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
+                      &cm->fc.NMVcount, xd->allow_high_precision_mv);
+            mbmi->need_to_clamp_secondmv = check_mv_bounds(mv1,
+                                                             mb_to_left_edge,
+                                                             mb_to_right_edge,
+                                                             mb_to_top_edge,
+                                                             mb_to_bottom_edge);
+          }
+          break;
+        default:
 #if CONFIG_DEBUG
-        assert(0);
+          assert(0);
 #endif
+          break;
+      }
     }
   } else {
-    /* required for left and above block mv */
-    mbmi->mv[0].as_int = 0;
+    // required for left and above block mv
+    mv0->as_int = 0;
 
-    if (mbmi->sb_type) {
-      mbmi->mode = read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
-      pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+      const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+      const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+      const int bsl = MIN(bwl, bhl);
+      mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
+      cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
     } else {
-      mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob);
-      pbi->common.fc.ymode_counts[mbmi->mode]++;
-    }
-
-    // If MB mode is BPRED read the block modes
-    if (mbmi->mode == B_PRED) {
-      int j = 0;
-      do {
-        int m = read_bmode(bc, pbi->common.fc.bmode_prob);
-        mi->bmi[j].as_mode.first = m;
-#if CONFIG_NEWBINTRAMODES
-        if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-        pbi->common.fc.bmode_counts[m]++;
-      } while (++j < 16);
-    }
-
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      for (i = 0; i < 4; i++) {
-        const int ib = vp9_i8x8_block[i];
-        const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
+          int ib = idy * 2 + idx, k;
+          int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+          mi->bmi[ib].as_mode.first = m;
+          cm->fc.y_mode_counts[0][m]++;
+          for (k = 1; k < bh; ++k)
+            mi->bmi[ib + k * 2].as_mode.first = m;
+          for (k = 1; k < bw; ++k)
+            mi->bmi[ib + k].as_mode.first = m;
+        }
       }
-    } else {
-      mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
-      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+      mbmi->mode = mi->bmi[3].as_mode.first;
     }
-  }
-  /*
-  if (pbi->common.current_video_frame == 1)
-    printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff);
-    */
 
-  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
-                           mbmi->partitioning == PARTITIONING_4X4)))) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV) {
-      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
-      if (mbmi->sb_type && mbmi->txfm_size != TX_8X8)
-        mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
-    }
-  } else if (mbmi->sb_type && cm->txfm_mode >= ALLOW_32X32) {
-    mbmi->txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
-    mbmi->txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 &&
-      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
-         mbmi->partitioning == PARTITIONING_4X4))) {
-    mbmi->txfm_size = TX_8X8;
-  } else {
-    mbmi->txfm_size = TX_4X4;
+    mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+    cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
   }
 }
 
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) {
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
   VP9_COMMON *cm = &pbi->common;
+  int k;
 
-  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
-  if (pbi->common.mb_no_coeff_skip) {
-    int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-      cm->mbskip_pred_probs[k] = vp9_read_prob(bc);
+  // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
+  // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+      cm->fc.mbskip_probs[k] =
+          vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
     }
+    // cm->fc.mbskip_probs[k] = vp9_read_prob(r);
   }
 
-  mb_mode_mv_init(pbi, bc);
+  mb_mode_mv_init(pbi, r);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static uint16_t read_nzc(VP9_COMMON *const cm,
-                         int nzc_context,
-                         TX_SIZE tx_size,
-                         int ref,
-                         int type,
-                         BOOL_DECODER* const bc) {
-  int c, e;
-  uint16_t nzc;
-  if (tx_size == TX_32X32) {
-    c = treed_read(bc, vp9_nzc32x32_tree,
-                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    c = treed_read(bc, vp9_nzc16x16_tree,
-                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    c = treed_read(bc, vp9_nzc8x8_tree,
-                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    c = treed_read(bc, vp9_nzc4x4_tree,
-                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  } else {
-    assert(0);
-  }
-  nzc = vp9_basenzcvalue[c];
-  if ((e = vp9_extranzcbits[c])) {
-    int x = 0;
-    while (e--) {
-      int b = vp9_read(
-          bc, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      x |= (b << e);
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-    nzc += x;
-  }
-  if (tx_size == TX_32X32)
-    assert(nzc <= 1024);
-  else if (tx_size == TX_16X16)
-    assert(nzc <= 256);
-  else if (tx_size == TX_8X8)
-    assert(nzc <= 64);
-  else if (tx_size == TX_4X4)
-    assert(nzc <= 16);
-  return nzc;
-}
-
-static void read_nzcs_sb64(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_sb32(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_mb16(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
+                           int mi_row,
+                           int mi_col,
+                           vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
-  MODE_INFO *prev_mi = xd->prev_mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (pbi->common.frame_type == KEY_FRAME) {
-    kfread_modes(pbi, mi, mb_row, mb_col, bc);
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+    kfread_modes(pbi, mi, mi_row, mi_col, r);
   } else {
-    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
-    set_scale_factors(xd,
-                      mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
-                      pbi->common.active_ref_scale);
+    read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)
-    read_nzcs_sb64(cm, xd, mb_row, mb_col, bc);
-  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)
-    read_nzcs_sb32(cm, xd, mb_row, mb_col, bc);
-  else
-    read_nzcs_mb16(cm, xd, mb_row, mb_col, bc);
-#endif  // CONFIG_CODE_NONZEROCOUNT
 
-  if (mbmi->sb_type) {
-    const int n_mbs = 1 << mbmi->sb_type;
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+  if (1) {
+    const int bw = 1 << mi_width_log2(mbmi->sb_type);
+    const int bh = 1 << mi_height_log2(mbmi->sb_type);
+    const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+    const int x_mis = MIN(bw, cm->mi_cols - mi_col);
     const int mis = cm->mode_info_stride;
     int x, y;
 
-    for (y = 0; y < y_mbs; y++) {
-      for (x = !y; x < x_mbs; x++) {
+    for (y = 0; y < y_mis; y++)
+      for (x = !y; x < x_mis; x++)
         mi[y * mis + x] = *mi;
-      }
-    }
-  } else {
-    update_blockd_bmi(xd);
   }
 }
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -17,7 +17,7 @@
                            MACROBLOCKD* const xd,
                            int mb_row,
                            int mb_col,
-                           BOOL_DECODER* const bc);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
+                           vp9_reader *r);
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r);
 
 #endif  // VP9_DECODER_VP9_DECODEMV_H_
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -8,36 +8,32 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "./vp9_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+
+#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_modecont.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_setupintrarecon.h"
-
-#include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_modecont.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
-#include "vp9_rtcd.h"
 
-#include <assert.h>
-#include <stdio.h>
+#include "vp9/decoder/vp9_dboolhuff.h"
+#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_read_bit_buffer.h"
 
-#define COEFCOUNT_TESTING
 
 // #define DEC_DEBUG
 #ifdef DEC_DEBUG
@@ -44,24 +40,111 @@
 int dec_debug = 0;
 #endif
 
-static int read_le16(const uint8_t *p) {
-  return (p[1] << 8) | p[0];
+static int read_be32(const uint8_t *p) {
+  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
 }
 
-static int read_le32(const uint8_t *p) {
-  return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
-}
-
 // len == 0 is not allowed
-static int read_is_valid(const unsigned char *start, size_t len,
-                         const unsigned char *end) {
+static int read_is_valid(const uint8_t *start, size_t len,
+                         const uint8_t *end) {
   return start + len > start && start + len <= end;
 }
 
+static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
+  if (lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    pc->txfm_mode = vp9_read_literal(r, 2);
+    if (pc->txfm_mode == ALLOW_32X32)
+      pc->txfm_mode += vp9_read_bit(r);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      int i, j;
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_8x8p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]);
+        }
+      }
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_16x16p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]);
+        }
+      }
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_32x32p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_32x32p[i][j]);
+        }
+      }
+    }
+  }
+}
+
+static int get_unsigned_bits(unsigned int num_values) {
+  int cat = 0;
+  if (num_values <= 1)
+    return 0;
+  num_values--;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
+static int inv_recenter_nonneg(int v, int m) {
+  if (v > 2 * m)
+    return v;
+
+  return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+}
+
+static int decode_uniform(vp9_reader *r, int n) {
+  int v;
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (!l)
+    return 0;
+
+  v = vp9_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
+}
+
+static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
+  int i = 0, mk = 0, word;
+  while (1) {
+    const int b = i ? k + i - 1 : k;
+    const int a = 1 << b;
+    if (num_syms <= mk + 3 * a) {
+      word = decode_uniform(r, num_syms - mk) + mk;
+      break;
+    } else {
+      if (vp9_read_bit(r)) {
+        i++;
+        mk += a;
+      } else {
+        word = vp9_read_literal(r, b) + mk;
+        break;
+      }
+    }
+  }
+  return word;
+}
+
+static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
+  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+  return data > max ? max : data;
+}
+
 static int merge_index(int v, int n, int modulus) {
   int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (v < max1) v = v * modulus + modulus / 2;
-  else {
+  if (v < max1) {
+    v = v * modulus + modulus / 2;
+  } else {
     int w;
     v -= max1;
     w = v;
@@ -73,1166 +156,427 @@
 }
 
 static int inv_remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
+  const int n = 255;
 
-  v = merge_index(v, n - 1, modulus);
+  v = merge_index(v, n - 1, MODULUS_PARAM);
+  m--;
   if ((m << 1) <= n) {
-    return vp9_inv_recenter_nonneg(v + 1, m);
+    return 1 + inv_recenter_nonneg(v + 1, m);
   } else {
-    return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+    return n - inv_recenter_nonneg(v + 1, n - 1 - m);
   }
 }
 
-static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
-  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
+vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) {
+  int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
   return (vp9_prob)inv_remap_prob(delp, oldp);
 }
 
-void vp9_init_de_quantizer(VP9D_COMP *pbi) {
-  int i;
+void vp9_init_dequantizer(VP9_COMMON *pc) {
   int q;
-  VP9_COMMON *const pc = &pbi->common;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    pc->Y1dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y1dc_delta_q);
-    pc->UVdequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uvdc_delta_q);
+    // DC value
+    pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
+    pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
 
-    /* all the ac values =; */
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d_4x4[i];
-
-      pc->Y1dequant[q][rc] = (int16_t)vp9_ac_yquant(q);
-      pc->UVdequant[q][rc] = (int16_t)vp9_ac_uv_quant(q, pc->uvac_delta_q);
-    }
+    // AC values
+    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
   }
 }
 
-static int get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex) {
-  // Set the Q baseline allowing for any segment level adjustment
-  if (vp9_segfeature_active(mb, segment_id, SEG_LVL_ALT_Q)) {
-    if (mb->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      return vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q);  // Abs Value
-    else
-      return clamp(base_qindex + vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q),
-                   0, MAXQ);  // Delta Value
-  } else {
-    return base_qindex;
-  }
-}
-
-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {
+static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
   int i;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex);
 
-  VP9_COMMON *const pc = &pbi->common;
-  const int segment_id = mb->mode_info_context->mbmi.segment_id;
-  const int qindex = get_qindex(mb, segment_id, pc->base_qindex);
-  mb->q_index = qindex;
+  xd->plane[0].dequant = pc->y_dequant[xd->q_index];
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
+}
 
-  for (i = 0; i < 16; i++)
-    mb->block[i].dequant = pc->Y1dequant[qindex];
+static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  MACROBLOCKD* const xd = arg;
+  int16_t* const qcoeff = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const int stride = xd->plane[plane].dst.stride;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 stride);
 
-  for (i = 16; i < 24; i++)
-    mb->block[i].dequant = pc->UVdequant[qindex];
+  TX_TYPE tx_type;
 
-  if (mb->lossless) {
-    assert(qindex == 0);
-    mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
-    mb->inv_txm4x4        = vp9_short_iwalsh4x4;
-    mb->itxm_add          = vp9_dequant_idct_add_lossless_c;
-    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;
-    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    mb->inv_txm4x4_1      = vp9_short_idct4x4_1;
-    mb->inv_txm4x4        = vp9_short_idct4x4;
-    mb->itxm_add          = vp9_dequant_idct_add;
-    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
-    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
+  switch (ss_txfrm_size / 2) {
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        xd->itxm_add(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      else
+        vp9_iht_add_c(tx_type, qcoeff, dst, stride,
+                      xd->plane[plane].eobs[block]);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride,
+                        xd->plane[plane].eobs[block]);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride,
+                          xd->plane[plane].eobs[block]);
+      break;
+    case TX_32X32:
+      vp9_idct_add_32x32(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      break;
   }
 }
 
-/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
- *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
- */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          int mb_row, int mb_col) {
-  BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                               int ss_txfrm_size, void *arg) {
+  MACROBLOCKD* const xd = arg;
+  int16_t* const qcoeff = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const int stride = xd->plane[plane].dst.stride;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 stride);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  TX_TYPE tx_type;
+  int mode, b_mode;
+  int plane_b_size;
+  int tx_ib = raster_block >> tx_size;
+  mode = plane == 0? xd->mode_info_context->mbmi.mode:
+                     xd->mode_info_context->mbmi.uv_mode;
 
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    if (sb_type == BLOCK_SIZE_SB64X64) {
-      vp9_build_intra_predictors_sb64uv_s(xd);
-      vp9_build_intra_predictors_sb64y_s(xd);
-    } else if (sb_type == BLOCK_SIZE_SB32X32) {
-      vp9_build_intra_predictors_sbuv_s(xd);
-      vp9_build_intra_predictors_sby_s(xd);
-    } else {
-      vp9_build_intra_predictors_mbuv_s(xd);
-      vp9_build_intra_predictors_mby_s(xd);
-    }
-  } else {
-    if (sb_type == BLOCK_SIZE_SB64X64) {
-      vp9_build_inter64x64_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else if (sb_type == BLOCK_SIZE_SB32X32) {
-      vp9_build_inter32x32_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    }
-  }
-}
 
-static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc) {
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("qcoeff 16x16\n");
-    for (i = 0; i < 400; i++) {
-      printf("%3d ", xd->qcoeff[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 400; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
-  if (tx_type != DCT_DCT) {
-    vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
-                                    xd->block[0].dequant, xd->predictor,
-                                    xd->dst.y_buffer, 16, xd->dst.y_stride,
-                                    xd->eobs[0]);
+  if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 && plane == 0) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first;
   } else {
-    vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
-                               xd->predictor, xd->dst.y_buffer,
-                               16, xd->dst.y_stride, xd->eobs[0]);
+    b_mode = mode;
   }
-  vp9_dequant_idct_add_uv_block_8x8(
-      xd->qcoeff + 16 * 16, xd->block[16].dequant,
-      xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-      xd->dst.uv_stride, xd);
-}
 
-static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                       BOOL_DECODER* const bc) {
-  // First do Y
-  // if the first one is DCT_DCT assume all the rest are as well
-  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("qcoeff 8x8\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->qcoeff[i]);
-      if (i % 16 == 15) printf("\n");
-    }
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
   }
-#endif
-  if (tx_type != DCT_DCT || xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-      int16_t *q  = xd->block[idx].qcoeff;
-      int16_t *dq = xd->block[0].dequant;
-      uint8_t *pre = xd->block[ib].predictor;
-      uint8_t *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
-      int stride = xd->dst.y_stride;
-      BLOCKD *b = &xd->block[ib];
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-        int i8x8mode = b->bmi.as_mode.first;
-        vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
-      }
-      tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
-                                      xd->eobs[idx]);
-      } else {
-        vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
-                                   xd->eobs[idx]);
-      }
-    }
-  } else {
-    vp9_dequant_idct_add_y_block_8x8(xd->qcoeff,
-                                     xd->block[0].dequant,
-                                     xd->predictor,
-                                     xd->dst.y_buffer,
-                                     xd->dst.y_stride,
-                                     xd);
-  }
 
-  // Now do UV
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      BLOCKD *b = &xd->block[ib];
-      int i8x8mode = b->bmi.as_mode.first;
+  plane_b_size = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size,
+                          b_mode, dst, xd->plane[plane].dst.stride);
 
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
-
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
-    }
-  } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd);
-  } else {
-    vp9_dequant_idct_add_uv_block_8x8
-        (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd);
+  switch (ss_txfrm_size / 2) {
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        xd->itxm_add(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      else
+        vp9_iht_add_c(tx_type, qcoeff, dst, stride,
+                      xd->plane[plane].eobs[block]);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride,
+                        xd->plane[plane].eobs[block]);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride,
+                          xd->plane[plane].eobs[block]);
+      break;
+    case TX_32X32:
+      vp9_idct_add_32x32(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      break;
   }
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
 }
 
-static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                       BOOL_DECODER* const bc) {
-  TX_TYPE tx_type;
-  int i, eobtotal = 0;
-  MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
-  if (mode == I8X8_PRED) {
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      const int iblock[4] = {0, 1, 4, 5};
-      int j;
-      BLOCKD *b = &xd->block[ib];
-      int i8x8mode = b->bmi.as_mode.first;
-      vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
-      for (j = 0; j < 4; j++) {
-        b = &xd->block[ib + iblock[j]];
-        tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-        if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                    b->dequant, b->predictor,
-                                    *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride, xd->eobs[ib + iblock[j]]);
-        } else {
-          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                       *(b->base_dst) + b->dst, 16, b->dst_stride,
-                       xd->eobs[ib + iblock[j]]);
-        }
-      }
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
-    }
-  } else if (mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-#if CONFIG_NEWBINTRAMODES
-      xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
-          vp9_find_bpred_context(xd, b);
-#endif
-      if (!xd->mode_info_context->mbmi.mb_skip_coeff)
-        eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
+static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col,
+                        vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-      vp9_intra4x4_predict(xd, b, b_mode, b->predictor);
-      tx_type = get_tx_type_4x4(xd, i);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride,
-                                  xd->eobs[i]);
-      } else {
-        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
-      }
-    }
-    if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
-      vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
-    }
-    vp9_build_intra_predictors_mbuv(xd);
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) {
-    xd->itxm_add_y_block(xd->qcoeff,
-                          xd->block[0].dequant,
-                          xd->predictor,
-                          xd->dst.y_buffer,
-                          xd->dst.y_stride,
-                          xd);
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  } else {
-#if 0  // def DEC_DEBUG
-    if (dec_debug) {
-      int i;
-      printf("\n");
-      printf("qcoeff 4x4\n");
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-    }
-#endif
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      tx_type = get_tx_type_4x4(xd, i);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16,
-                                  b->dst_stride, xd->eobs[i]);
-      } else {
-        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
-      }
-    }
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  }
-}
+  assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
-static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, int mb_col,
-                                BOOL_DECODER* const bc) {
-  int n, eobtotal;
-  VP9_COMMON *const pc = &pbi->common;
-  MODE_INFO *mi = xd->mode_info_context;
-  const int mis = pc->mode_info_stride;
+  if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
 
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
+  // prediction
+  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
+  } else {
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(&pbi->common, xd);
 
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_sb64_tokens_context(xd);
-
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
+    if (!vp9_reader_has_error(r)) {
+      vp9_decode_tokens(pbi, xd, r, bsize);
+    }
   }
+  foreach_transformed_block(xd, bsize, decode_block, xd);
+}
 
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sb64y_s(xd);
-    vp9_build_intra_predictors_sb64uv_s(xd);
+static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                          int mi_row, int mi_col,
+                          vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
   } else {
-    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(&pbi->common, xd);
 
-  /* dequantization and idct */
-  eobtotal = vp9_decode_sb64_tokens(pbi, xd, bc);
-  if (eobtotal == 0) {  // skip loopfilter
-    for (n = 0; n < 16; n++) {
-      const int x_idx = n & 3, y_idx = n >> 2;
-
-      if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows)
-        mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    if (!vp9_reader_has_error(r)) {
+      vp9_decode_tokens(pbi, xd, r, bsize);
     }
-  } else {
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int y_offset = x_idx * 32 + y_idx * xd->dst.y_stride * 32;
-          vp9_dequant_idct_add_32x32(xd->qcoeff + n * 1024,
-              xd->block[0].dequant,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 64]);
-        }
-        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096,
-            xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256]);
-        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096 + 1024,
-            xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]);
-        break;
-      case TX_16X16:
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;
-          const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                                    (y_idx * 16 + x_idx) * 4);
-
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          } else {
-            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          }
-        }
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int uv_offset = y_idx * 16 * xd->dst.uv_stride + x_idx * 16;
-          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + n * 256,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 16]);
-          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + 1024 + n * 256,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]);
-        }
-        break;
-      case TX_8X8:
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;
-          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          } else {
-            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          }
-        }
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 4]);
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096 + 1024,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]);
-        }
-        break;
-      case TX_4X4:
-        for (n = 0; n < 256; n++) {
-          const int x_idx = n & 15, y_idx = n >> 4;
-          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;
-          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-          if (tx_type == DCT_DCT) {
-            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          } else {
-            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          }
-        }
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;
-          xd->itxm_add(xd->qcoeff + 4096 + n * 16,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n]);
-          xd->itxm_add(xd->qcoeff + 4096 + 1024 + n * 16,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n]);
-        }
-        break;
-      default: assert(0);
-    }
   }
+
+  foreach_transformed_block(xd, bsize, decode_block_intra, xd);
 }
 
-static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, int mb_col,
-                                BOOL_DECODER* const bc) {
+
+static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
+                      vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize);
+  const int bw = 1 << bwl, bh = 1 << bhl;
   int n, eobtotal;
   VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *const mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int mis = pc->mode_info_stride;
 
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+  assert(mbmi->sb_type == bsize);
+  assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
   if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
 
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
+  // generate prediction
+  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd);
-
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
-  }
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(xd);
-    vp9_build_intra_predictors_sbuv_s(xd);
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
   } else {
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(pc, xd);
 
-  /* dequantization and idct */
-  eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
-  if (eobtotal == 0) {  // skip loopfilter
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    if (mb_col + 1 < pc->mb_cols)
-      xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
-    if (mb_row + 1 < pc->mb_rows) {
-      xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
-      if (mb_col + 1 < pc->mb_cols)
-        xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
-    }
-  } else {
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_dequant_idct_add_32x32(xd->qcoeff, xd->block[0].dequant,
-                                   xd->dst.y_buffer, xd->dst.y_buffer,
-                                   xd->dst.y_stride, xd->dst.y_stride,
-                                   xd->eobs[0]);
-        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
-                                              xd->block[16].dequant,
-                                              xd->dst.u_buffer,
-                                              xd->dst.v_buffer,
-                                              xd->dst.uv_stride, xd);
-        break;
-      case TX_16X16:
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;
-          const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                                    (y_idx * 8 + x_idx) * 4);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_16x16(
-                xd->qcoeff + n * 256, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          } else {
-            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          }
-        }
-        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
-                                              xd->block[16].dequant,
-                                              xd->dst.u_buffer,
-                                              xd->dst.v_buffer,
-                                              xd->dst.uv_stride, xd);
-        break;
-      case TX_8X8:
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;
-          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          } else {
-            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          }
-        }
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1024,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n * 4]);
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1280,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]);
-        }
-        break;
-      case TX_4X4:
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;
+    // dequantization and idct
+    eobtotal = vp9_decode_tokens(pbi, xd, r, bsize);
+    if (eobtotal == 0) {  // skip loopfilter
+      for (n = 0; n < bw * bh; n++) {
+        const int x_idx = n & (bw - 1), y_idx = n >> bwl;
 
-          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-          if (tx_type == DCT_DCT) {
-            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          } else {
-            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          }
-        }
-
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;
-          xd->itxm_add(xd->qcoeff + 1024 + n * 16,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n]);
-          xd->itxm_add(xd->qcoeff + 1280 + n * 16,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n]);
-        }
-        break;
-      default: assert(0);
-    }
-  }
-}
-
-static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
-  int eobtotal = 0;
-  MB_PREDICTION_MODE mode;
-  int tx_size;
-
-  assert(!xd->mode_info_context->mbmi.sb_type);
-
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  tx_size = xd->mode_info_context->mbmi.txfm_size;
-  mode = xd->mode_info_context->mbmi.mode;
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_mb_tokens_context(xd);
-  } else if (!bool_error(bc)) {
-    if (mode != B_PRED)
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-  }
-
-  //mode = xd->mode_info_context->mbmi.mode;
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
-                             &pbi->common);
-
-  if (eobtotal == 0 &&
-      mode != B_PRED &&
-      mode != SPLITMV &&
-      mode != I8X8_PRED &&
-      !bool_error(bc)) {
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-       mb_skip_coeff are zero. */
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
-  }
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d\n", xd->mode_info_context->mbmi.mode, tx_size);
-#endif
-
-  // moved to be performed before detokenization
-  //  if (xd->segmentation_enabled)
-  //    mb_init_dequantizer(pbi, xd);
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    if (mode != I8X8_PRED) {
-      vp9_build_intra_predictors_mbuv(xd);
-      if (mode != B_PRED) {
-        vp9_build_intra_predictors_mby(xd);
+        if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows)
+          mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1;
       }
+    } else {
+      foreach_transformed_block(xd, bsize, decode_block, xd);
     }
-  } else {
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d interp %d\n",
-           xd->mode_info_context->mbmi.mode, tx_size,
-           xd->mode_info_context->mbmi.interp_filter);
-#endif
-    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   }
-
-  if (tx_size == TX_16X16) {
-    decode_16x16(pbi, xd, bc);
-  } else if (tx_size == TX_8X8) {
-    decode_8x8(pbi, xd, bc);
-  } else {
-    decode_4x4(pbi, xd, bc);
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int i, j;
-    printf("\n");
-    printf("predictor y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->predictor[i * 16 + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->dst.y_buffer[i * xd->dst.y_stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final u\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->dst.u_buffer[i * xd->dst.uv_stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final v\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->dst.v_buffer[i * xd->dst.uv_stride + j]);
-      printf("\n");
-    }
-    fflush(stdout);
-  }
-#endif
 }
 
-
-static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
-  int ret_val = 0;
-
-  if (vp9_read_bit(bc)) {
-    ret_val = vp9_read_literal(bc, 4);
-
-    if (vp9_read_bit(bc))
-      ret_val = -ret_val;
-  }
-
-  /* Trigger a quantizer update if the delta-q value has changed */
-  if (ret_val != prev)
-    *q_update = 1;
-
-  return ret_val;
-}
-
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
-static void set_offsets(VP9D_COMP *pbi, int block_size,
-                        int mb_row, int mb_col) {
+static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
+                        int mi_row, int mi_col) {
+  const int bh = 1 << mi_height_log2(bsize);
+  const int bw = 1 << mi_width_log2(bsize);
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const int mis = cm->mode_info_stride;
-  const int idx = mis * mb_row + mb_col;
-  const int dst_fb_idx = cm->new_fb_idx;
-  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
-  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
-  const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
-  const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
+  const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
+  int i;
 
-  xd->mode_info_context = cm->mi + idx;
-  xd->mode_info_context->mbmi.sb_type = block_size >> 5;
-  xd->prev_mode_info_context = cm->prev_mi + idx;
-  xd->above_context = cm->above_context + mb_col;
-  xd->left_context = cm->left_context + (mb_row & 3);
+  xd->mode_info_context = cm->mi + mi_idx;
+  xd->mode_info_context->mbmi.sb_type = bsize;
+  // Special case: if prev_mi is NULL, the previous mode info context
+  // cannot be used.
+  xd->prev_mode_info_context = cm->prev_mi ?
+                                 cm->prev_mi + mi_idx : NULL;
 
-  // Distance of Mb to the various image edges.
-  // These are specified to 8th pel as they are always compared to
-  // values that are in 1/8th pel units
-  block_size >>= 4;  // in mb units
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].above_context = cm->above_context[i] +
+        (mi_col * 2 >> xd->plane[i].subsampling_x);
+    xd->plane[i].left_context = cm->left_context[i] +
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+  }
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 
-  set_mb_row(cm, xd, mb_row, block_size);
-  set_mb_col(cm, xd, mb_col, block_size);
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
 }
 
-static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
+static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (mbmi->ref_frame > INTRA_FRAME) {
+  if (mbmi->ref_frame[0] > INTRA_FRAME) {
     // Select the appropriate reference frame for this MB
-    int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
-    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
-    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
-    setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
+    const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
+    const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
+    xd->scale_factor[0]    = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+    setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
+                     xd->scale_factor, xd->scale_factor_uv);
+    xd->corrupted |= cfg->corrupted;
 
-    // propagate errors from reference frames
-    xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
-
-    if (mbmi->second_ref_frame > INTRA_FRAME) {
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
       // Select the appropriate reference frame for this MB
-      int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
-
-      setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-
-      // propagate errors from reference frames
-      xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
+      const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
+      const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+      xd->scale_factor[1]    = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+      xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+      setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
+                       xd->scale_factor, xd->scale_factor_uv);
+      xd->corrupted |= second_cfg->corrupted;
     }
   }
 }
 
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
-                          int mb_row, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc) {
-  int mb_col;
+static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
+                           vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  // For a SB there are 2 left contexts, each pertaining to a MB row within
-  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  set_offsets(pbi, bsize, mi_row, mi_col);
+  vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
+  set_refs(pbi, mi_row, mi_col);
 
-  for (mb_col = pc->cur_tile_mb_col_start;
-       mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
-    if (vp9_read(bc, pc->sb64_coded)) {
-#ifdef DEC_DEBUG
-      dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                   mb_row == 8 && mb_col == 0);
-      if (dec_debug)
-        printf("Debug Decode SB64\n");
-#endif
-      set_offsets(pbi, 64, mb_row, mb_col);
-      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-      set_refs(pbi, 64, mb_row, mb_col);
-      decode_superblock64(pbi, xd, mb_row, mb_col, bc);
-      xd->corrupted |= bool_error(bc);
-    } else {
-      int j;
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
+                                     BLOCK_SIZE_SB8X8 : bsize);
+  else if (bsize < BLOCK_SIZE_SB8X8)
+    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+  else
+    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
 
-      for (j = 0; j < 4; j++) {
-        const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
+  xd->corrupted |= vp9_reader_has_error(r);
+}
 
-        if (mb_row + y_idx_sb >= pc->mb_rows ||
-            mb_col + x_idx_sb >= pc->mb_cols) {
-          // MB lies outside frame, skip on to next
-          continue;
-        }
+static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
+                            vp9_reader* r, BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  int bsl = mi_width_log2(bsize), bs = (1 << bsl) / 2;
+  int n;
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE_TYPE subsize;
 
-        xd->sb_index = j;
+  if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
+    return;
 
-        if (vp9_read(bc, pc->sb32_coded)) {
-#ifdef DEC_DEBUG
-          dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                       mb_row + y_idx_sb == 8 && mb_col + x_idx_sb == 0);
-          if (dec_debug)
-            printf("Debug Decode SB32\n");
-#endif
-          set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
-          vp9_decode_mb_mode_mv(pbi,
-                                xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
-          set_refs(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
-          decode_superblock32(pbi,
-                              xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
-          xd->corrupted |= bool_error(bc);
-        } else {
-          int i;
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
 
-          // Process the 4 MBs within the SB in the order:
-          // top-left, top-right, bottom-left, bottom-right
-          for (i = 0; i < 4; i++) {
-            const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int pl;
+    int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
+    // read the partition information
+    xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = pc->above_seg_context + mi_col;
+    pl = partition_plane_context(xd, bsize);
 
-            if (mb_row + y_idx >= pc->mb_rows ||
-                mb_col + x_idx >= pc->mb_cols) {
-              // MB lies outside frame, skip on to next
-              continue;
-            }
-#ifdef DEC_DEBUG
-            dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                         mb_row + y_idx == 8 && mb_col + x_idx == 0);
-            if (dec_debug)
-              printf("Debug Decode MB\n");
-#endif
+    if (idx == 0)
+      partition = treed_read(r, vp9_partition_tree,
+                             pc->fc.partition_prob[pc->frame_type][pl]);
+    else if (idx > 0 &&
+        !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx]))
+      partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
+    else
+      partition = PARTITION_SPLIT;
 
-            set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
-            xd->mb_index = i;
-            vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
-            set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
-            decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
+    pc->fc.partition_counts[pl][partition]++;
+  }
 
-            /* check if the boolean decoder has suffered an error */
-            xd->corrupted |= bool_error(bc);
-          }
-        }
+  subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      break;
+    case PARTITION_HORZ:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      if (mi_row + bs < pc->mi_rows)
+        decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
+      break;
+    case PARTITION_VERT:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      if (mi_col + bs < pc->mi_cols)
+        decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
+      break;
+    case PARTITION_SPLIT:
+      for (n = 0; n < 4; n++) {
+        int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
+        decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
-    }
+      break;
+    default:
+      assert(0);
   }
+  // update partition context
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
-
 static void setup_token_decoder(VP9D_COMP *pbi,
-                                const unsigned char *cx_data,
-                                BOOL_DECODER* const bool_decoder) {
+                                const uint8_t *data, size_t read_size,
+                                vp9_reader *r) {
   VP9_COMMON *pc = &pbi->common;
-  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
-  const unsigned char *partition = cx_data;
-  ptrdiff_t bytes_left = user_data_end - partition;
-  ptrdiff_t partition_size = bytes_left;
+  const uint8_t *data_end = pbi->source + pbi->source_sz;
 
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
-  if (!read_is_valid(partition, partition_size, user_data_end)) {
+  if (!read_is_valid(data, read_size, data_end))
     vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet or corrupt partition "
-                       "%d length", 1);
-  }
+                       "Truncated packet or corrupt tile length");
 
-  if (vp9_start_decode(bool_decoder,
-                       partition, (unsigned int)partition_size))
+  if (vp9_reader_init(r, data, read_size))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
-static void init_frame(VP9D_COMP *pbi) {
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd = &pbi->mb;
+static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+                                   vp9_reader *r) {
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size];
 
-  if (pc->frame_type == KEY_FRAME) {
-    vp9_setup_past_independence(pc, xd);
-    // All buffers are implicitly updated on key frames.
-    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
-  } else if (pc->error_resilient_mode) {
-    vp9_setup_past_independence(pc, xd);
-  }
-
-  if (pc->frame_type != KEY_FRAME) {
-    pc->mcomp_filter_type = pc->use_bilinear_mc_filter ? BILINEAR : EIGHTTAP;
-
-    // To enable choice of different interpolation filters
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-  xd->frame_type = pc->frame_type;
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_stride = pc->mode_info_stride;
-  xd->corrupted = 0;
-  xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff;
-}
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void read_nzc_probs_common(VP9_COMMON *cm,
-                                  BOOL_DECODER* const bc,
-                                  int block_size) {
-  int c, r, b, t;
-  int tokens, nodes;
-  vp9_prob *nzc_probs;
-  vp9_prob upd;
-
-  if (!vp9_read_bit(bc)) return;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
-  } else {
-    tokens = NZC4X4_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
-  }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob *p = &nzc_probs[offset_nodes + t];
-          if (vp9_read(bc, upd)) {
-            *p = read_prob_diff_update(bc, *p);
-          }
-        }
-      }
-    }
-  }
-}
-
-static void read_nzc_pcat_probs(VP9_COMMON *cm, BOOL_DECODER* const bc) {
-  int c, t, b;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  if (!vp9_read_bit(bc)) {
-    return;
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b];
-        if (vp9_read(bc, upd)) {
-          *p = read_prob_diff_update(bc, *p);
-        }
-      }
-    }
-  }
-}
-
-static void read_nzc_probs(VP9_COMMON *cm,
-                           BOOL_DECODER* const bc) {
-  read_nzc_probs_common(cm, bc, 4);
-  if (cm->txfm_mode != ONLY_4X4)
-    read_nzc_probs_common(cm, bc, 8);
-  if (cm->txfm_mode > ALLOW_8X8)
-    read_nzc_probs_common(cm, bc, 16);
-  if (cm->txfm_mode > ALLOW_16X16)
-    read_nzc_probs_common(cm, bc, 32);
-#ifdef NZC_PCAT_UPDATE
-  read_nzc_pcat_probs(cm, bc);
-#endif
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void read_coef_probs_common(BOOL_DECODER* const bc,
-                                   vp9_coeff_probs *coef_probs,
-                                   int block_types) {
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;
-#else
-  const int entropy_nodes_update = ENTROPY_NODES;
-#endif
-
   int i, j, k, l, m;
 
-  if (vp9_read_bit(bc)) {
-    for (i = 0; i < block_types; i++) {
+  if (vp9_read_bit(r)) {
+    for (i = 0; i < BLOCK_TYPES; i++) {
       for (j = 0; j < REF_TYPES; j++) {
         for (k = 0; k < COEF_BANDS; k++) {
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+            const int mstart = 0;
             if (l >= 3 && k == 0)
               continue;
-            for (m = CONFIG_CODE_NONZEROCOUNT; m < entropy_nodes_update; m++) {
+
+            for (m = mstart; m < entropy_nodes_update; m++) {
               vp9_prob *const p = coef_probs[i][j][k][l] + m;
 
-              if (vp9_read(bc, vp9_coef_update_prob[m])) {
-                *p = read_prob_diff_update(bc, *p);
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-                if (m == UNCONSTRAINED_NODES - 1)
-                  vp9_get_model_distribution(*p, coef_probs[i][j][k][l], i, j);
-#endif
+              if (vp9_read(r, vp9_coef_update_prob[m])) {
+                *p = vp9_read_prob_diff_update(r, *p);
               }
             }
           }
@@ -1242,159 +586,104 @@
   }
 }
 
-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *const pc = &pbi->common;
+static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) {
+  const TXFM_MODE txfm_mode = pbi->common.txfm_mode;
+  FRAME_CONTEXT *const fc = &pbi->common.fc;
 
-  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES);
+  read_coef_probs_common(fc, TX_4X4, r);
 
-  if (pbi->common.txfm_mode != ONLY_4X4)
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES);
+  if (txfm_mode > ONLY_4X4)
+    read_coef_probs_common(fc, TX_8X8, r);
 
-  if (pbi->common.txfm_mode > ALLOW_8X8)
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES);
+  if (txfm_mode > ALLOW_8X8)
+    read_coef_probs_common(fc, TX_16X16, r);
 
-  if (pbi->common.txfm_mode > ALLOW_16X16)
-    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES);
+  if (txfm_mode > ALLOW_16X16)
+    read_coef_probs_common(fc, TX_32X32, r);
 }
 
-static void update_frame_size(VP9D_COMP *pbi) {
-  VP9_COMMON *cm = &pbi->common;
-
-  /* our internal buffers are always multiples of 16 */
-  const int width = (cm->width + 15) & ~15;
-  const int height = (cm->height + 15) & ~15;
-
-  cm->mb_rows = height >> 4;
-  cm->mb_cols = width >> 4;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-  cm->mode_info_stride = cm->mb_cols + 1;
-  memset(cm->mip, 0,
-        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vp9_update_mode_info_border(cm, cm->mip);
-
-  cm->mi = cm->mip + cm->mode_info_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  vp9_update_mode_info_in_image(cm, cm->mi);
-}
-
-static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
+static void setup_segmentation(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
   int i, j;
 
-  xd->segmentation_enabled = vp9_read_bit(r);
-  if (xd->segmentation_enabled) {
-    // Read whether or not the segmentation map is being explicitly updated
-    // this frame.
-    xd->update_mb_segmentation_map = vp9_read_bit(r);
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-    // If so what method will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Which macro block level features are enabled. Read the probs used to
-      // decode the segment id for each macro block.
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255;
-      }
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
 
-      // Read the prediction probs needed to decode the segment id
-      pc->temporal_update = vp9_read_bit(r);
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        pc->segment_pred_probs[i] = pc->temporal_update
-            ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255)
-            : 255;
-      }
+  xd->segmentation_enabled = vp9_rb_read_bit(rb);
+  if (!xd->segmentation_enabled)
+    return;
 
-      if (pc->temporal_update) {
-        const vp9_prob *p = xd->mb_segment_tree_probs;
-        vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs;
-        const int c0 =        p[0]  *        p[1];
-        const int c1 =        p[0]  * (256 - p[1]);
-        const int c2 = (256 - p[0]) *        p[2];
-        const int c3 = (256 - p[0]) * (256 - p[2]);
+  // Segmentation map update
+  xd->update_mb_segmentation_map = vp9_rb_read_bit(rb);
+  if (xd->update_mb_segmentation_map) {
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++)
+      xd->mb_segment_tree_probs[i] = vp9_rb_read_bit(rb) ?
+                                         vp9_rb_read_literal(rb, 8) : MAX_PROB;
 
-        p_mod[0] = get_binary_prob(c1, c2 + c3);
-        p_mod[1] = get_binary_prob(c0, c2 + c3);
-        p_mod[2] = get_binary_prob(c0 + c1, c3);
-        p_mod[3] = get_binary_prob(c0 + c1, c2);
-      }
+    cm->temporal_update = vp9_rb_read_bit(rb);
+    if (cm->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        cm->segment_pred_probs[i] = vp9_rb_read_bit(rb) ?
+                                        vp9_rb_read_literal(rb, 8) : MAX_PROB;
+    } else {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        cm->segment_pred_probs[i] = MAX_PROB;
     }
+  }
 
-    xd->update_mb_segmentation_data = vp9_read_bit(r);
-    if (xd->update_mb_segmentation_data) {
-      int data;
+  // Segmentation data update
+  xd->update_mb_segmentation_data = vp9_rb_read_bit(rb);
+  if (xd->update_mb_segmentation_data) {
+    xd->mb_segment_abs_delta = vp9_rb_read_bit(rb);
 
-      xd->mb_segment_abs_delta = vp9_read_bit(r);
+    vp9_clearall_segfeatures(xd);
 
-      vp9_clearall_segfeatures(xd);
-
-      // For each segmentation...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each of the segments features...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          // Is the feature enabled
-          if (vp9_read_bit(r)) {
-            // Update the feature data and mask
-            vp9_enable_segfeature(xd, i, j);
-
-            data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j));
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              if (vp9_read_bit(r))
-                data = -data;
-            }
-          } else {
-            data = 0;
-          }
-
-          vp9_set_segdata(xd, i, j, data);
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        int data = 0;
+        const int feature_enabled = vp9_rb_read_bit(rb);
+        if (feature_enabled) {
+          vp9_enable_segfeature(xd, i, j);
+          data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
+          if (vp9_is_segfeature_signed(j))
+            data = vp9_rb_read_bit(rb) ? -data : data;
         }
+        vp9_set_segdata(xd, i, j, data);
       }
     }
   }
 }
 
-static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
-  int i;
+static void setup_loopfilter(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r);
-  pc->filter_level = vp9_read_literal(r, 6);
-  pc->sharpness_level = vp9_read_literal(r, 3);
+  cm->filter_level = vp9_rb_read_literal(rb, 6);
+  cm->sharpness_level = vp9_rb_read_literal(rb, 3);
 
-#if CONFIG_LOOP_DERING
-  if (vp9_read_bit(r))
-    pc->dering_enabled = 1 + vp9_read_literal(r, 4);
-  else
-    pc->dering_enabled = 0;
-#endif
-
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);
 
+  xd->mode_ref_lf_delta_enabled = vp9_rb_read_bit(rb);
   if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    xd->mode_ref_lf_delta_update = vp9_read_bit(r);
-
+    xd->mode_ref_lf_delta_update = vp9_rb_read_bit(rb);
     if (xd->mode_ref_lf_delta_update) {
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        if (vp9_read_bit(r)) {
-          // sign = vp9_read_bit(r);
-          xd->ref_lf_deltas[i] = vp9_read_literal(r, 6);
+      int i;
 
-          if (vp9_read_bit(r))
-            xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];  // Apply sign
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        if (vp9_rb_read_bit(rb)) {
+          const int value = vp9_rb_read_literal(rb, 6);
+          xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
         }
       }
 
-      // Send update
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        if (vp9_read_bit(r)) {
-          // sign = vp9_read_bit(r);
-          xd->mode_lf_deltas[i] = vp9_read_literal(r, 6);
-
-          if (vp9_read_bit(r))
-            xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];  // Apply sign
+        if (vp9_rb_read_bit(rb)) {
+          const int value = vp9_rb_read_literal(rb, 6);
+          xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
         }
       }
     }
@@ -1401,168 +690,234 @@
   }
 }
 
-static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
-                                      const uint8_t *data,
-                                      const uint8_t *data_end) {
-  VP9_COMMON *const pc = &pbi->common;
-  const int width = pc->width;
-  const int height = pc->height;
-
-  // If error concealment is enabled we should only parse the new size
-  // if we have enough data. Otherwise we will end up with the wrong size.
-  if (scaling_active && data + 4 < data_end) {
-    pc->display_width = read_le16(data + 0);
-    pc->display_height = read_le16(data + 2);
-    data += 4;
+static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
+  const int old = *delta_q;
+  if (vp9_rb_read_bit(rb)) {
+    const int value = vp9_rb_read_literal(rb, 4);
+    *delta_q = vp9_rb_read_bit(rb) ? -value : value;
   }
+  return old != *delta_q;
+}
 
-  if (data + 4 < data_end) {
-    pc->width = read_le16(data + 0);
-    pc->height = read_le16(data + 2);
-    data += 4;
-  }
+static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  MACROBLOCKD *const xd = &pbi->mb;
+  VP9_COMMON *const cm = &pbi->common;
+  int update = 0;
 
-  if (!scaling_active) {
-    pc->display_width = pc->width;
-    pc->display_height = pc->height;
+  cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
+  update |= read_delta_q(rb, &cm->y_dc_delta_q);
+  update |= read_delta_q(rb, &cm->uv_dc_delta_q);
+  update |= read_delta_q(rb, &cm->uv_ac_delta_q);
+  if (update)
+    vp9_init_dequantizer(cm);
+
+  xd->lossless = cm->base_qindex == 0 &&
+                 cm->y_dc_delta_q == 0 &&
+                 cm->uv_dc_delta_q == 0 &&
+                 cm->uv_ac_delta_q == 0;
+  if (xd->lossless) {
+    xd->itxm_add          = vp9_idct_add_lossless_c;
+    xd->itxm_add_y_block  = vp9_idct_add_y_block_lossless_c;
+    xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
+  } else {
+    xd->itxm_add          = vp9_idct_add;
+    xd->itxm_add_y_block  = vp9_idct_add_y_block;
+    xd->itxm_add_uv_block = vp9_idct_add_uv_block;
   }
+}
 
-  if (width != pc->width || height != pc->height) {
-    if (pc->width <= 0) {
-      pc->width = width;
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame width");
-    }
+static INTERPOLATIONFILTERTYPE read_interp_filter_type(
+    struct vp9_read_bit_buffer *rb) {
+  return vp9_rb_read_bit(rb) ? SWITCHABLE
+                             : vp9_rb_read_literal(rb, 2);
+}
 
-    if (pc->height <= 0) {
-      pc->height = height;
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame height");
-    }
+static void read_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb,
+                            int *width, int *height) {
+  const int w = vp9_rb_read_literal(rb, 16) + 1;
+  const int h = vp9_rb_read_literal(rb, 16) + 1;
+  *width = w;
+  *height = h;
+}
 
+static void setup_display_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  cm->display_width = cm->width;
+  cm->display_height = cm->height;
+  if (vp9_rb_read_bit(rb))
+    read_frame_size(cm, rb, &cm->display_width, &cm->display_height);
+}
+
+static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
+  VP9_COMMON *cm = &pbi->common;
+
+  if (cm->width != width || cm->height != height) {
     if (!pbi->initial_width || !pbi->initial_height) {
-      if (vp9_alloc_frame_buffers(pc, pc->width, pc->height))
-        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+      if (vp9_alloc_frame_buffers(cm, width, height))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate frame buffers");
-      pbi->initial_width = pc->width;
-      pbi->initial_height = pc->height;
-    }
+      pbi->initial_width = width;
+      pbi->initial_height = height;
+    } else {
+      if (width > pbi->initial_width)
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame width too large");
 
-    if (pc->width > pbi->initial_width) {
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Frame width too large");
+      if (height > pbi->initial_height)
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame height too large");
     }
 
-    if (pc->height > pbi->initial_height) {
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Frame height too large");
-    }
+    cm->width = width;
+    cm->height = height;
 
-    update_frame_size(pbi);
+    vp9_update_frame_size(cm);
   }
 
-  return data;
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
 }
 
-static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) {
-  FRAME_CONTEXT *const fc = &pbi->common.fc;
+static void setup_frame_size(VP9D_COMP *pbi,
+                             struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  int width, height;
+  read_frame_size(cm, rb, &width, &height);
+  setup_display_size(pbi, rb);
+  apply_frame_size(pbi, width, height);
+}
 
-  vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4);
-  vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8);
-  vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16);
-  vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32);
-  vp9_copy(fc->pre_ymode_prob, fc->ymode_prob);
-  vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
-  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
-  vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
-  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
-  vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
-  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
+static void setup_frame_size_with_refs(VP9D_COMP *pbi,
+                                       struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+
+  int width, height;
+  int found = 0, i;
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    if (vp9_rb_read_bit(rb)) {
+      YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]];
+      width = cfg->y_crop_width;
+      height = cfg->y_crop_height;
+      found = 1;
+      break;
+    }
+  }
+
+  if (!found)
+    read_frame_size(cm, rb, &width, &height);
+
+  if (!width || !height)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Referenced frame with invalid size");
+
+  setup_display_size(pbi, rb);
+  apply_frame_size(pbi, width, height);
+}
+
+static void update_frame_context(FRAME_CONTEXT *fc) {
+  vp9_copy(fc->pre_coef_probs, fc->coef_probs);
+  vp9_copy(fc->pre_y_mode_prob, fc->y_mode_prob);
+  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
+  vp9_copy(fc->pre_partition_prob, fc->partition_prob[1]);
+  vp9_copy(fc->pre_intra_inter_prob, fc->intra_inter_prob);
+  vp9_copy(fc->pre_comp_inter_prob, fc->comp_inter_prob);
+  vp9_copy(fc->pre_single_ref_prob, fc->single_ref_prob);
+  vp9_copy(fc->pre_comp_ref_prob, fc->comp_ref_prob);
   fc->pre_nmvc = fc->nmvc;
+  vp9_copy(fc->pre_switchable_interp_prob, fc->switchable_interp_prob);
+  vp9_copy(fc->pre_inter_mode_probs, fc->inter_mode_probs);
+  vp9_copy(fc->pre_tx_probs_8x8p, fc->tx_probs_8x8p);
+  vp9_copy(fc->pre_tx_probs_16x16p, fc->tx_probs_16x16p);
+  vp9_copy(fc->pre_tx_probs_32x32p, fc->tx_probs_32x32p);
+  vp9_copy(fc->pre_mbskip_probs, fc->mbskip_probs);
 
-  vp9_zero(fc->coef_counts_4x4);
-  vp9_zero(fc->coef_counts_8x8);
-  vp9_zero(fc->coef_counts_16x16);
-  vp9_zero(fc->coef_counts_32x32);
+  vp9_zero(fc->coef_counts);
   vp9_zero(fc->eob_branch_counts);
-  vp9_zero(fc->ymode_counts);
-  vp9_zero(fc->sb_ymode_counts);
+  vp9_zero(fc->y_mode_counts);
   vp9_zero(fc->uv_mode_counts);
-  vp9_zero(fc->bmode_counts);
-  vp9_zero(fc->i8x8_mode_counts);
-  vp9_zero(fc->sub_mv_ref_counts);
-  vp9_zero(fc->mbsplit_counts);
   vp9_zero(fc->NMVcount);
-  vp9_zero(fc->mv_ref_ct);
+  vp9_zero(fc->inter_mode_counts);
+  vp9_zero(fc->partition_counts);
+  vp9_zero(fc->switchable_interp_count);
+  vp9_zero(fc->intra_inter_count);
+  vp9_zero(fc->comp_inter_count);
+  vp9_zero(fc->single_ref_count);
+  vp9_zero(fc->comp_ref_count);
+  vp9_zero(fc->tx_count_8x8p);
+  vp9_zero(fc->tx_count_16x16p);
+  vp9_zero(fc->tx_count_32x32p);
+  vp9_zero(fc->mbskip_count);
+}
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  fc->pre_interintra_prob = fc->interintra_prob;
-  vp9_zero(fc->interintra_counts);
-#endif
+static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
+  VP9_COMMON *const pc = &pbi->common;
+  int mi_row, mi_col;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4);
-  vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8);
-  vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16);
-  vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32);
-  vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs);
+  for (mi_row = pc->cur_tile_mi_row_start;
+       mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
+    // For a SB there are 2 left contexts, each pertaining to a MB row within
+    vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
+    vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
+    for (mi_col = pc->cur_tile_mi_col_start;
+         mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
+      decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
+  }
+}
 
-  vp9_zero(fc->nzc_counts_4x4);
-  vp9_zero(fc->nzc_counts_8x8);
-  vp9_zero(fc->nzc_counts_16x16);
-  vp9_zero(fc->nzc_counts_32x32);
-  vp9_zero(fc->nzc_pcat_counts);
-#endif
+static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+  int delta_log2_tiles;
 
-  read_coef_probs(pbi, r);
-#if CONFIG_CODE_NONZEROCOUNT
-  read_nzc_probs(&pbi->common, r);
-#endif
+  vp9_get_tile_n_bits(cm, &cm->log2_tile_columns, &delta_log2_tiles);
+  while (delta_log2_tiles--) {
+    if (vp9_rb_read_bit(rb)) {
+      cm->log2_tile_columns++;
+    } else {
+      break;
+    }
+  }
+
+  cm->log2_tile_rows = vp9_rb_read_bit(rb);
+  if (cm->log2_tile_rows)
+    cm->log2_tile_rows += vp9_rb_read_bit(rb);
+
+  cm->tile_columns = 1 << cm->log2_tile_columns;
+  cm->tile_rows    = 1 << cm->log2_tile_rows;
 }
 
 static void decode_tiles(VP9D_COMP *pbi,
-                         const uint8_t *data, int first_partition_size,
-                         BOOL_DECODER *header_bc, BOOL_DECODER *residual_bc) {
+                         const uint8_t *data, size_t first_partition_size,
+                         vp9_reader *residual_bc) {
   VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
 
   const uint8_t *data_ptr = data + first_partition_size;
-  int tile_row, tile_col, delta_log2_tiles;
-  int mb_row;
+  const uint8_t* const data_end = pbi->source + pbi->source_sz;
+  int tile_row, tile_col;
 
-  vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
-  while (delta_log2_tiles--) {
-    if (vp9_read_bit(header_bc)) {
-      pc->log2_tile_columns++;
-    } else {
-      break;
-    }
-  }
-  pc->log2_tile_rows = vp9_read_bit(header_bc);
-  if (pc->log2_tile_rows)
-    pc->log2_tile_rows += vp9_read_bit(header_bc);
-  pc->tile_columns = 1 << pc->log2_tile_columns;
-  pc->tile_rows    = 1 << pc->log2_tile_rows;
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(pc));
 
-  vpx_memset(pc->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+  vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mi_cols_aligned_to_sb(pc));
 
   if (pbi->oxcf.inv_tile_order) {
     const int n_cols = pc->tile_columns;
     const uint8_t *data_ptr2[4][1 << 6];
-    BOOL_DECODER bc_bak = {0};
+    vp9_reader bc_bak = {0};
 
     // pre-initialize the offsets, we're going to read in inverse order
     data_ptr2[0][0] = data_ptr;
     for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
       if (tile_row) {
-        const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]);
+        const int size = read_be32(data_ptr2[tile_row - 1][n_cols - 1]);
         data_ptr2[tile_row - 1][n_cols - 1] += 4;
         data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
       }
 
       for (tile_col = 1; tile_col < n_cols; tile_col++) {
-        const int size = read_le32(data_ptr2[tile_row][tile_col - 1]);
+        const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
         data_ptr2[tile_row][tile_col - 1] += 4;
         data_ptr2[tile_row][tile_col] =
             data_ptr2[tile_row][tile_col - 1] + size;
@@ -1573,14 +928,10 @@
       vp9_get_tile_row_offsets(pc, tile_row);
       for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {
         vp9_get_tile_col_offsets(pc, tile_col);
-        setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], residual_bc);
-
-        // Decode a row of superblocks
-        for (mb_row = pc->cur_tile_mb_row_start;
-             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);
-        }
-
+        setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
+                            data_end - data_ptr2[tile_row][tile_col],
+                            residual_bc);
+        decode_tile(pbi, residual_bc);
         if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)
           bc_bak = *residual_bc;
       }
@@ -1592,333 +943,295 @@
     for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
       vp9_get_tile_row_offsets(pc, tile_row);
       for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+        size_t size;
+
         vp9_get_tile_col_offsets(pc, tile_col);
 
         has_more = tile_col < pc->tile_columns - 1 ||
                    tile_row < pc->tile_rows - 1;
+        if (has_more) {
+          if (!read_is_valid(data_ptr, 4, data_end))
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile length");
 
-        // Setup decoder
-        setup_token_decoder(pbi, data_ptr + (has_more ? 4 : 0), residual_bc);
-
-        // Decode a row of superblocks
-        for (mb_row = pc->cur_tile_mb_row_start;
-             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);
+          size = read_be32(data_ptr);
+          data_ptr += 4;
+        } else {
+          size = data_end - data_ptr;
         }
 
-        if (has_more) {
-          const int size = read_le32(data_ptr);
-          data_ptr += 4 + size;
-        }
+        setup_token_decoder(pbi, data_ptr, size, residual_bc);
+        decode_tile(pbi, residual_bc);
+        data_ptr += size;
       }
     }
   }
 }
 
-int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
-  BOOL_DECODER header_bc, residual_bc;
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const uint8_t *data = (const uint8_t *)pbi->Source;
-  const uint8_t *data_end = data + pbi->source_sz;
-  ptrdiff_t first_partition_length_in_bytes = 0;
-  int i, corrupt_tokens = 0;
+static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+  if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
+      vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
+      vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Invalid frame sync code");
+  }
+}
 
-  // printf("Decoding frame %d\n", pc->current_video_frame);
+static void error_handler(void *data, int bit_offset) {
+  VP9_COMMON *const cm = (VP9_COMMON *)data;
+  vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
 
-  xd->corrupted = 0;  // start with no corruption of current frame
-  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+static void setup_inter_inter(VP9_COMMON *cm) {
+  int i;
 
-  if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
-  } else {
-    int scaling_active;
-    pc->last_frame_type = pc->frame_type;
-    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-    pc->version = (data[0] >> 1) & 7;
-    pc->show_frame = (data[0] >> 4) & 1;
-    scaling_active = (data[0] >> 5) & 1;
-    first_partition_length_in_bytes = read_le16(data + 1);
+  cm->allow_comp_inter_inter = 0;
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    cm->allow_comp_inter_inter |= i > 0 &&
+        cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
+  }
 
-    if (!read_is_valid(data, first_partition_length_in_bytes, data_end))
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt partition 0 length");
+  if (cm->allow_comp_inter_inter) {
+    // which one is always-on in comp inter-inter?
+    if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+        cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+      cm->comp_fixed_ref = ALTREF_FRAME;
+      cm->comp_var_ref[0] = LAST_FRAME;
+      cm->comp_var_ref[1] = GOLDEN_FRAME;
+    } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+               cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+      cm->comp_fixed_ref = GOLDEN_FRAME;
+      cm->comp_var_ref[0] = LAST_FRAME;
+      cm->comp_var_ref[1] = ALTREF_FRAME;
+    } else {
+      cm->comp_fixed_ref = LAST_FRAME;
+      cm->comp_var_ref[0] = GOLDEN_FRAME;
+      cm->comp_var_ref[1] = ALTREF_FRAME;
+    }
+  }
+}
 
-    data += 3;
+#define RESERVED \
+  if (vp9_rb_read_bit(rb)) \
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \
+                         "Reserved bit must be unset")
 
-    vp9_setup_version(pc);
+static size_t read_uncompressed_header(VP9D_COMP *pbi,
+                                       struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  int i;
 
-    if (pc->frame_type == KEY_FRAME) {
-      // When error concealment is enabled we should only check the sync
-      // code if we have enough bits available
-      if (data + 3 < data_end) {
-        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
-          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
-                             "Invalid frame sync code");
-      }
-      data += 3;
-    }
+  cm->last_frame_type = cm->frame_type;
 
-    data = setup_frame_size(pbi, scaling_active, data, data_end);
-  }
+  if (vp9_rb_read_literal(rb, 2) != 0x2)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame marker");
 
-  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
-      pc->width == 0 || pc->height == 0) {
-    return -1;
+  cm->version = vp9_rb_read_bit(rb);
+  RESERVED;
+
+  if (vp9_rb_read_bit(rb)) {
+    // show an existing frame directly
+    int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show);
+    pbi->refresh_frame_flags = 0;
+    cm->filter_level = 0;
+    return 0;
   }
 
-  init_frame(pbi);
+  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
+  cm->show_frame = vp9_rb_read_bit(rb);
+  cm->error_resilient_mode = vp9_rb_read_bit(rb);
 
-  // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
-                                pc->width, pc->height,
-                                VP9BORDERINPIXELS);
+  if (cm->frame_type == KEY_FRAME) {
+    int csp;
 
-  if (vp9_start_decode(&header_bc, data,
-                       (unsigned int)first_partition_length_in_bytes))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
+    check_sync_code(cm, rb);
 
-  pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
-  pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
-  pc->error_resilient_mode = vp9_read_bit(&header_bc);
+    csp = vp9_rb_read_literal(rb, 3);  // colorspace
+    if (csp != 7) {  // != sRGB
+      vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+      if (cm->version == 1) {
+        cm->subsampling_x = vp9_rb_read_bit(rb);
+        cm->subsampling_y = vp9_rb_read_bit(rb);
+        vp9_rb_read_bit(rb);  // has extra plane
+      } else {
+        cm->subsampling_y = cm->subsampling_x = 1;
+      }
+    } else {
+      if (cm->version == 1) {
+        cm->subsampling_y = cm->subsampling_x = 0;
+        vp9_rb_read_bit(rb);  // has extra plane
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "RGB not supported in profile 0");
+      }
+    }
 
-  setup_segmentation(pc, xd, &header_bc);
+    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
 
-  // Read common prediction model status flag probability updates for the
-  // reference frame
-  if (pc->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    pc->ref_pred_probs[0] = 120;
-    pc->ref_pred_probs[1] = 80;
-    pc->ref_pred_probs[2] = 40;
-  } else {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (vp9_read_bit(&header_bc))
-        pc->ref_pred_probs[i] = vp9_read_prob(&header_bc);
-    }
-  }
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+      cm->active_ref_idx[i] = cm->new_fb_idx;
 
-  pc->sb64_coded = vp9_read_prob(&header_bc);
-  pc->sb32_coded = vp9_read_prob(&header_bc);
-  xd->lossless = vp9_read_bit(&header_bc);
-  if (xd->lossless) {
-    pc->txfm_mode = ONLY_4X4;
+    setup_frame_size(pbi, rb);
   } else {
-    // Read the loop filter level and type
-    pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-    if (pc->txfm_mode == ALLOW_32X32)
-      pc->txfm_mode += vp9_read_bit(&header_bc);
+    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
 
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = vp9_read_prob(&header_bc);
-      pc->prob_tx[1] = vp9_read_prob(&header_bc);
-      pc->prob_tx[2] = vp9_read_prob(&header_bc);
-    }
-  }
+    cm->reset_frame_context = cm->error_resilient_mode ?
+        0 : vp9_rb_read_literal(rb, 2);
 
-  setup_loopfilter(pc, xd, &header_bc);
+    if (cm->intra_only) {
+      check_sync_code(cm, rb);
 
-  // Dummy read for now
-  vp9_read_literal(&header_bc, 2);
+      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
+      setup_frame_size(pbi, rb);
+    } else {
+       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
 
-  /* Read the default quantizers. */
-  {
-    int q_update = 0;
-    pc->base_qindex = vp9_read_literal(&header_bc, QINDEX_BITS);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+        const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LG2);
+        cm->active_ref_idx[i] = cm->ref_frame_map[ref];
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
+      }
 
-    /* AC 1st order Q = default */
-    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
-    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
-    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
+      setup_frame_size_with_refs(pbi, rb);
 
-    if (q_update)
-      vp9_init_de_quantizer(pbi);
+      xd->allow_high_precision_mv = vp9_rb_read_bit(rb);
+      cm->mcomp_filter_type = read_interp_filter_type(rb);
 
-    /* MB level dequantizer setup */
-    mb_init_dequantizer(pbi, &pbi->mb);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+        vp9_setup_scale_factors(cm, i);
+
+      setup_inter_inter(cm);
+    }
   }
 
-  // Determine if the golden frame or ARF buffer should be updated and how.
-  // For all non key frames the GF and ARF refresh flags and sign bias
-  // flags must be set explicitly.
-  if (pc->frame_type == KEY_FRAME) {
-    pc->active_ref_idx[0] = pc->new_fb_idx;
-    pc->active_ref_idx[1] = pc->new_fb_idx;
-    pc->active_ref_idx[2] = pc->new_fb_idx;
+  if (!cm->error_resilient_mode) {
+    cm->refresh_frame_context = vp9_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
   } else {
-    // Should the GF or ARF be updated from the current frame
-    pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);
+    cm->refresh_frame_context = 0;
+    cm->frame_parallel_decoding_mode = 1;
+  }
 
-    // Select active reference frames
-    for (i = 0; i < 3; i++) {
-      int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
-      pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];
-    }
+  cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2);
 
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+  if ((cm->frame_type == KEY_FRAME) ||
+      cm->error_resilient_mode || cm->intra_only)
+    vp9_setup_past_independence(cm, xd);
 
-    // Is high precision mv allowed
-    xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
+  setup_loopfilter(pbi, rb);
+  setup_quantization(pbi, rb);
+  setup_segmentation(pbi, rb);
 
-    // Read the type of subpel filter to use
-    pc->mcomp_filter_type = vp9_read_bit(&header_bc)
-                                ? SWITCHABLE
-                                : vp9_read_literal(&header_bc, 2);
+  setup_tile_info(cm, rb);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    pc->use_interintra = vp9_read_bit(&header_bc);
-#endif
-    // To enable choice of different interploation filters
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
+  return vp9_rb_read_literal(rb, 16);
+}
 
-  if (!pc->error_resilient_mode) {
-    pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
-    pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);
-  } else {
-    pc->refresh_entropy_probs = 0;
-    pc->frame_parallel_decoding_mode = 1;
-  }
-  pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
-  vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],
-             sizeof(pc->fc));
+int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
+  int i;
+  vp9_reader header_bc, residual_bc;
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  // Read inter mode probability context updates
-  if (pc->frame_type != KEY_FRAME) {
-    int i, j;
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      for (j = 0; j < 4; j++) {
-        if (vp9_read(&header_bc, 252)) {
-          pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
-        }
-      }
-    }
+  const uint8_t *data = pbi->source;
+  const uint8_t *data_end = pbi->source + pbi->source_sz;
+
+  struct vp9_read_bit_buffer rb = { data, data_end, 0,
+                                    pc, error_handler };
+  const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
+  const int keyframe = pc->frame_type == KEY_FRAME;
+  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+
+  if (!first_partition_size) {
+    // showing a frame directly
+    *p_data_end = data + 1;
+    return 0;
   }
-#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS
-  if (pc->frame_type == KEY_FRAME)
-    vp9_adjust_default_coef_probs(pc);
-#endif
+  data += vp9_rb_bytes_read(&rb);
+  xd->corrupted = 0;
+  new_fb->corrupted = 0;
 
-#if CONFIG_NEW_MVREF
-  // If Key frame reset mv ref id probabilities to defaults
-  if (pc->frame_type != KEY_FRAME) {
-    // Read any mv_ref index probability updates
-    int i, j;
+  if (!pbi->decoded_key_frame && !keyframe)
+    return -1;
 
-    for (i = 0; i < MAX_REF_FRAMES; ++i) {
-      // Skip the dummy entry for intra ref frame.
-      if (i == INTRA_FRAME) {
-        continue;
-      }
+  if (!read_is_valid(data, first_partition_size, data_end))
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt header length");
 
-      // Read any updates to probabilities
-      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
-        if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
-          xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc);
-        }
-      }
-    }
-  }
-#endif
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+  xd->frame_type = pc->frame_type;
+  xd->mode_info_stride = pc->mode_info_stride;
 
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,R:%d,Q:%d\n",
-            pc->current_video_frame,
-            pc->frame_type,
-            pbi->refresh_frame_flags,
-            pc->base_qindex);
-    fclose(z);
-  }
+  if (vp9_reader_init(&header_bc, data, first_partition_size))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
 
-  update_frame_context(pbi, &header_bc);
+  mb_init_dequantizer(pc, &pbi->mb);  // MB level dequantizer setup
 
+  if (!keyframe)
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+
+  pc->fc = pc->frame_contexts[pc->frame_context_idx];
+
+  update_frame_context(&pc->fc);
+
+  setup_txfm_mode(pc, xd->lossless, &header_bc);
+
+  read_coef_probs(pbi, &header_bc);
+
   // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
-             sizeof(YV12_BUFFER_CONFIG));
-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx],
-             sizeof(YV12_BUFFER_CONFIG));
+  setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
+                   0, 0, NULL, NULL);
+  setup_dst_planes(xd, new_fb, 0, 0);
 
   // Create the segmentation map structure and set to 0
   if (!pc->last_frame_seg_map)
     CHECK_MEM_ERROR(pc->last_frame_seg_map,
-                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+                    vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
 
-  /* set up frame new frame for intra coded blocks */
-  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+  vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
 
-  vp9_setup_block_dptrs(xd);
+  // clear out the coeff buffer
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
-  vp9_build_block_doffsets(xd);
+  set_prev_mi(pc);
 
-  /* clear out the coeff buffer */
-  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
-  /* Read the mb_no_coeff_skip flag */
-  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
-
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
-  decode_tiles(pbi, data, first_partition_length_in_bytes,
-               &header_bc, &residual_bc);
-  corrupt_tokens |= xd->corrupted;
+  decode_tiles(pbi, data, first_partition_size, &residual_bc);
 
-  // keep track of the last coded dimensions
   pc->last_width = pc->width;
   pc->last_height = pc->height;
 
-  // Collect information about decoder corruption.
-  // 1. Check first boolean decoder for errors.
-  // 2. Check the macroblock information
-  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) |
-                                          corrupt_tokens;
+  new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)
+    if (keyframe && !new_fb->corrupted)
       pbi->decoded_key_frame = 1;
     else
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                          "A stream must start with a complete key frame");
   }
 
+  // Adaptation
   if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(pc);
-#endif
-  }
 
-  if (pc->frame_type != KEY_FRAME) {
-    if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
+    if ((!keyframe) && (!pc->intra_only)) {
       vp9_adapt_mode_probs(pc);
+      vp9_adapt_mode_context(pc);
       vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-      vp9_adapt_mode_context(&pbi->common);
     }
   }
 
-  if (pc->refresh_entropy_probs) {
-    vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc,
-               sizeof(pc->fc));
-  }
+  if (pc->refresh_frame_context)
+    pc->frame_contexts[pc->frame_context_idx] = pc->fc;
 
-#ifdef PACKET_TESTING
-  {
-    FILE *f = fopen("decompressor.VP8", "ab");
-    unsigned int size = residual_bc.pos + header_bc.pos + 8;
-    fwrite((void *) &size, 4, 1, f);
-    fwrite((void *) pbi->Source, size, 1, f);
-    fclose(f);
-  }
-#endif
-
-  /* Find the end of the coded buffer */
-  while (residual_bc.count > CHAR_BIT &&
-         residual_bc.count < VP9_BD_VALUE_SIZE) {
-    residual_bc.count -= CHAR_BIT;
-    residual_bc.user_buffer--;
-  }
-  *p_data_end = residual_bc.user_buffer;
+  *p_data_end = vp9_reader_find_end(&residual_bc);
   return 0;
 }
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -12,8 +12,11 @@
 #ifndef VP9_DECODER_VP9_DECODFRAME_H_
 #define VP9_DECODER_VP9_DECODFRAME_H_
 
+struct VP9Common;
 struct VP9Decompressor;
 
-void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
+void vp9_init_dequantizer(struct VP9Common *pc);
+int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
+vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
--- a/vp9/decoder/vp9_dequantize.c
+++ /dev/null
@@ -1,401 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9_rtcd.h"
-#include "vp9/decoder/vp9_dequantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/common/vp9_common.h"
-
-
-static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride, int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + pred[c]);
-
-    dest += stride;
-    diff += width;
-    pred += pitch;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 4, 4);
-}
-
-void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 8, 8);
-}
-
-void vp9_add_residual_16x16_c(const int16_t *diff, const uint8_t *pred,
-                              int pitch, uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 16, 16);
-}
-
-void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred,
-                              int pitch, uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 32, 32);
-}
-
-static void add_constant_residual(const int16_t diff, const uint8_t *pred,
-                                  int pitch, uint8_t *dest, int stride,
-                                  int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff + pred[c]);
-
-    dest += stride;
-    pred += pitch;
-  }
-}
-
-void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred,
-                                     int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 8, 8);
-}
-
-void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred,
-                                       int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 16, 16);
-}
-
-void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred,
-                                       int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 32, 32);
-}
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                               const int16_t *dq,
-                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  for (i = 0; i < 16; i++)
-    input[i] *= dq[i];
-
-  vp9_short_iht4x4(input, output, 4, tx_type);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq,
-                                   uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eob > 0) {
-    int i;
-
-    input[0] *= dq[0];
-    for (i = 1; i < 64; i++)
-      input[i] *= dq[1];
-
-    vp9_short_iht8x8(input, output, 8, tx_type);
-    vpx_memset(input, 0, 128);
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                            uint8_t *dest, int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  if (eob > 1) {
-    for (i = 0; i < 16; i++)
-      input[i] *= dq[i];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
-
-    vpx_memset(input, 0, 32);
-
-    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-  } else {
-    vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                               uint8_t *dest, int pitch, int stride, int dc) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  input[0] = dc;
-
-  for (i = 1; i < 16; i++)
-    input[i] *= dq[i];
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                     uint8_t *pred, uint8_t *dest,
-                                     int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  if (eob > 1) {
-    for (i = 0; i < 16; i++)
-      input[i] *= dq[i];
-
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
-
-    vpx_memset(input, 0, 32);
-
-    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-  } else {
-    vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                        uint8_t *pred,
-                                        uint8_t *dest,
-                                        int pitch, int stride, int dc) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  input[0] = dc;
-
-  for (i = 1; i < 16; i++)
-    input[i] *= dq[i];
-
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
-                                uint8_t *pred, uint8_t *dest, int pitch,
-                                int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
-  // If dc is 1, then input[0] is the reconstructed value, do not need
-  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-  input[0] *= dq[0];
-
-  // The calculation can be simplified if there are not many non-zero dct
-  // coefficients. Use eobs to decide what to do.
-  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-  // Combine that with code here.
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eob == 1) {
-    // DC only DCT coefficient
-    int16_t in = input[0];
-    int16_t out;
-
-     // Note: the idct1 will need to be modified accordingly whenever
-     // vp9_short_idct8x8_c() is modified.
-    vp9_short_idct1_8x8_c(&in, &out);
-    input[0] = 0;
-
-    vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride);
-#if !CONFIG_SCATTERSCAN
-  } else if (eob <= 10) {
-    input[1] *= dq[1];
-    input[2] *= dq[1];
-    input[3] *= dq[1];
-    input[8] *= dq[1];
-    input[9] *= dq[1];
-    input[10] *= dq[1];
-    input[16] *= dq[1];
-    input[17] *= dq[1];
-    input[24] *= dq[1];
-
-    vp9_short_idct10_8x8(input, output, 16);
-
-    input[0] = input[1] = input[2] = input[3] = 0;
-    input[8] = input[9] = input[10] = 0;
-    input[16] = input[17] = 0;
-    input[24] = 0;
-
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-#endif
-  } else {
-    int i;
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 64; i++)
-      input[i] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct8x8(input, output, 8 << 1);
-    vpx_memset(input, 0, 128);
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest, int pitch, int stride,
-                                     int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eob > 0) {
-    int i;
-
-    input[0] *= dq[0];
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 256; i++)
-      input[i] *= dq[1];
-
-    // inverse hybrid transform
-    vp9_short_iht16x16(input, output, 16, tx_type);
-
-    // the idct halves ( >> 1) the pitch
-    // vp9_short_idct16x16(input, output, 32);
-
-    vpx_memset(input, 0, 512);
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
-                                  uint8_t *pred, uint8_t *dest, int pitch,
-                                  int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
-  /* The calculation can be simplified if there are not many non-zero dct
-   * coefficients. Use eobs to separate different cases. */
-  if (eob == 0) {
-    /* All 0 DCT coefficient */
-    vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eob == 1) {
-    /* DC only DCT coefficient. */
-    int16_t in = input[0] * dq[0];
-    int16_t out;
-    /* Note: the idct1 will need to be modified accordingly whenever
-     * vp9_short_idct16x16() is modified. */
-    vp9_short_idct1_16x16_c(&in, &out);
-    input[0] = 0;
-
-    vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride);
-#if !CONFIG_SCATTERSCAN
-  } else if (eob <= 10) {
-    input[0] *= dq[0];
-
-    input[1] *= dq[1];
-    input[2] *= dq[1];
-    input[3] *= dq[1];
-    input[16] *= dq[1];
-    input[17] *= dq[1];
-    input[18] *= dq[1];
-    input[32] *= dq[1];
-    input[33] *= dq[1];
-    input[48] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct10_16x16(input, output, 32);
-
-    input[0] = input[1] = input[2] = input[3] = 0;
-    input[16] = input[17] = input[18] = 0;
-    input[32] = input[33] = 0;
-    input[48] = 0;
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-#endif
-  } else {
-    int i;
-
-    input[0] *= dq[0];
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 256; i++)
-      input[i] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct16x16(input, output, 16 << 1);
-
-    vpx_memset(input, 0, 512);
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
-                                  uint8_t *pred, uint8_t *dest, int pitch,
-                                  int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
-
-  if (eob) {
-    input[0] = input[0] * dq[0] / 2;
-    if (eob == 1) {
-      vp9_short_idct1_32x32(input, output);
-      vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride);
-      input[0] = 0;
-#if !CONFIG_SCATTERSCAN
-    } else if (eob <= 10) {
-      input[1] = input[1] * dq[1] / 2;
-      input[2] = input[2] * dq[1] / 2;
-      input[3] = input[3] * dq[1] / 2;
-      input[32] = input[32] * dq[1] / 2;
-      input[33] = input[33] * dq[1] / 2;
-      input[34] = input[34] * dq[1] / 2;
-      input[64] = input[64] * dq[1] / 2;
-      input[65] = input[65] * dq[1] / 2;
-      input[96] = input[96] * dq[1] / 2;
-
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
-      input[0] = input[1] = input[2] = input[3] = 0;
-      input[32] = input[33] = input[34] = 0;
-      input[64] = input[65] = 0;
-      input[96] = 0;
-
-      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
-#endif
-    } else {
-      int i;
-      for (i = 1; i < 1024; i++)
-        input[i] = input[i] * dq[1] / 2;
-      vp9_short_idct32x32(input, output, 64);
-      vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
-    }
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,
-                                           uint8_t *dstu,
-                                           uint8_t *dstv,
-                                           int stride,
-                                           MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride,
-                               xd->eobs[64]);
-  vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride,
-                               xd->eobs[80]);
-}
--- a/vp9/decoder/vp9_dequantize.h
+++ /dev/null
@@ -1,96 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_DEQUANTIZE_H_
-#define VP9_DECODER_VP9_DEQUANTIZE_H_
-
-#include "vp9/common/vp9_blockd.h"
-
-
-void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                     unsigned char *pred,
-                                     unsigned char *output,
-                                     int pitch, int stride, int eob);
-
-void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                        unsigned char *pred,
-                                        unsigned char *output,
-                                        int pitch, int stride, int dc);
-
-void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
-                                                const int16_t *dq,
-                                                unsigned char *pre,
-                                                unsigned char *dst,
-                                                int stride,
-                                                const int16_t *dc);
-
-void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                             unsigned char *pre,
-                                             unsigned char *dst,
-                                             int stride,
-                                             struct macroblockd *xd);
-
-void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
-                                              unsigned char *pre,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int stride,
-                                              struct macroblockd *xd);
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
-                                    unsigned char *pred, unsigned char *dest,
-                                    int pitch, int stride, int eob);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int eob);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, unsigned char *pred,
-                                     unsigned char *dest,
-                                     int pitch, int stride, int eob);
-
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                   unsigned char *dst,
-                                                   int stride,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                unsigned char *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd);
-
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                   unsigned char *dst,
-                                                   int stride,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                unsigned char *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd);
-
-#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -10,6 +10,7 @@
 
 
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -16,8 +17,13 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#if CONFIG_BALANCED_COEFTREE
+#define ZERO_CONTEXT_NODE           0
+#define EOB_CONTEXT_NODE            1
+#else
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
+#endif
 #define ONE_CONTEXT_NODE            2
 #define LOW_VAL_CONTEXT_NODE        3
 #define TWO_CONTEXT_NODE            4
@@ -57,236 +63,185 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
-static int16_t get_signed(BOOL_DECODER *br, int16_t value_to_sign) {
-  return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
-}
-
-
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
-               [pt][token]++;     \
-    token_cache[c] = token; \
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache,     \
-                              c + 1, default_eob); \
+    coef_counts[type][ref][band][pt]         \
+               [token >= TWO_TOKEN ?     \
+                (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
+                token]++;     \
+    token_cache[scan[c]] = vp9_pt_energy_class[token]; \
   } while (0)
 
-#if CONFIG_CODE_NONZEROCOUNT
-#define WRITE_COEF_CONTINUE(val, token)                       \
-  {                                                           \
-    qcoeff_ptr[scan[c]] = get_signed(br, val);                \
-    INCREMENT_COUNT(token);                                   \
-    c++;                                                      \
-    nzc++;                                                    \
-    continue;                                                 \
-  }
-#else
 #define WRITE_COEF_CONTINUE(val, token)                  \
   {                                                      \
-    qcoeff_ptr[scan[c]] = get_signed(br, val);           \
+    qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+                            dq[c > 0] / (1 + (txfm_size == TX_32X32)); \
     INCREMENT_COUNT(token);                              \
     c++;                                                 \
     continue;                                            \
   }
-#endif  // CONFIG_CODE_NONZEROCOUNT
 
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
-    if (vp9_read(br, prob))            \
+    if (vp9_read(r, prob))             \
       val += 1 << bits_count;          \
   } while (0);
 
 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
-                        BOOL_DECODER* const br, int block_idx,
+                        vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
-                        TX_SIZE txfm_size) {
-  ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;
-  ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;
-  int aidx, lidx;
+                        TX_SIZE txfm_size, const int16_t *dq,
+                        ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
   ENTROPY_CONTEXT above_ec, left_ec;
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int pt, c = 0, pad, default_eob;
-  vp9_coeff_probs *coef_probs;
+  int band;
+  vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
+  vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = {
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+  };
+
   vp9_prob *prob;
-  vp9_coeff_count *coef_counts;
-  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
-#if CONFIG_CODE_NONZEROCOUNT
-  uint16_t nzc = 0;
-  uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx];
-#endif
+  vp9_coeff_count_model *coef_counts;
+  const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
+  TX_TYPE tx_type = DCT_DCT;
   const int *scan, *nb;
   uint8_t token_cache[1024];
+  const uint8_t * band_translate;
+#if CONFIG_BALANCED_COEFTREE
+  int skip_eob_node = 0;
+#endif
 
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    aidx = vp9_block2above_sb64[txfm_size][block_idx];
-    lidx = vp9_block2left_sb64[txfm_size][block_idx];
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    aidx = vp9_block2above_sb[txfm_size][block_idx];
-    lidx = vp9_block2left_sb[txfm_size][block_idx];
-  } else {
-    aidx = vp9_block2above[txfm_size][block_idx];
-    lidx = vp9_block2left[txfm_size][block_idx];
-  }
-
+  coef_probs  = fc->coef_probs[txfm_size][type][ref];
+  coef_counts = fc->coef_counts[txfm_size];
   switch (txfm_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, block_idx) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_4x4;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_4x4;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_4x4;
-          break;
-      }
-      above_ec = A0[aidx] != 0;
-      left_ec = L0[lidx] != 0;
-      coef_probs  = fc->coef_probs_4x4;
-      coef_counts = fc->coef_counts_4x4;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block_idx) : DCT_DCT;
+      scan = get_scan_4x4(tx_type);
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       default_eob = 16;
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = block_idx & ((1 << sz) - 1);
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_8x8;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_8x8;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_8x8;
-          break;
-      }
-      coef_probs  = fc->coef_probs_8x8;
-      coef_counts = fc->coef_counts_8x8;
-      above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
-      left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = block_idx & ((1 << sz) - 1);
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_16x16;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_16x16;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_16x16;
-          break;
-      }
-      coef_probs  = fc->coef_probs_16x16;
-      coef_counts = fc->coef_counts_16x16;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1]) != 0;
-      } else {
-        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;
-      }
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
-      coef_probs = fc->coef_probs_32x32;
-      coef_counts = fc->coef_counts_32x32;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
-        ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
-        ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
-        ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1] +
-                    A2[aidx] + A2[aidx + 1] + A3[aidx] + A3[aidx + 1]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1] +
-                    L2[lidx] + L2[lidx + 1] + L3[lidx] + L3[lidx + 1]) != 0;
-      } else {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3] +
-                    A1[aidx] + A1[aidx + 1] + A1[aidx + 2] + A1[aidx + 3]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] +
-                    L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;
-      }
+      scan = vp9_default_scan_32x32;
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   while (1) {
     int val;
     const uint8_t *cat6 = cat6_prob;
-
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc == nzc_expected)
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(band_translate, c);
+    prob = coef_probs[band][pt];
+#if !CONFIG_BALANCED_COEFTREE
+    fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+    if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
-#endif
-    prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
-#if CONFIG_CODE_NONZEROCOUNT == 0
-    fc->eob_branch_counts[txfm_size][type][ref]
-                         [get_coef_band(scan, txfm_size, c)][pt]++;
-    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
-      break;
-#endif
+
 SKIP_START:
+#endif
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc == nzc_expected)
-      break;
-    // decode zero node only if there are zeros left
-    if (seg_eob - nzc_expected - c + nzc > 0)
-#endif
-    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(band_translate, c);
+    prob = coef_probs[band][pt];
+
+    if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
+#if CONFIG_BALANCED_COEFTREE
+      skip_eob_node = 1;
+      continue;
+#else
       goto SKIP_START;
+#endif
     }
+#if CONFIG_BALANCED_COEFTREE
+    if (!skip_eob_node) {
+      fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+      if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+        break;
+    }
+    skip_eob_node = 0;
+#endif
+
     // ONE_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
       WRITE_COEF_CONTINUE(1, ONE_TOKEN);
     }
+    // Load full probabilities if not already loaded
+    if (!load_map[band][pt]) {
+      vp9_model_to_full_probs(coef_probs[band][pt],
+                              coef_probs_full[band][pt]);
+      load_map[band][pt] = 1;
+    }
+    prob = coef_probs_full[band][pt];
     // LOW_VAL_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(2, TWO_TOKEN);
       }
-      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[THREE_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(3, THREE_TOKEN);
       }
       WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
     // HIGH_LOW_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
         val = CAT1_MIN_VAL;
         ADJUST_COEF(CAT1_PROB0, 0);
         WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY1);
@@ -297,8 +252,8 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
     }
     // CAT_THREEFOUR_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
         val = CAT3_MIN_VAL;
         ADJUST_COEF(CAT3_PROB2, 2);
         ADJUST_COEF(CAT3_PROB1, 1);
@@ -313,7 +268,7 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
     }
     // CAT_FIVE_CONTEXT_NODE_0_:
-    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
       val = CAT5_MIN_VAL;
       ADJUST_COEF(CAT5_PROB4, 4);
       ADJUST_COEF(CAT5_PROB3, 3);
@@ -324,262 +279,73 @@
     }
     val = 0;
     while (*cat6) {
-      val = (val << 1) | vp9_read(br, *cat6++);
+      val = (val << 1) | vp9_read(r, *cat6++);
     }
     val += CAT6_MIN_VAL;
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
-#if CONFIG_CODE_NONZEROCOUNT == 0
   if (c < seg_eob)
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
-               [pt][DCT_EOB_TOKEN]++;
-#endif
+    coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
 
-  A0[aidx] = L0[lidx] = c > 0;
-  if (txfm_size >= TX_8X8) {
-    A0[aidx + 1] = L0[lidx + 1] = A0[aidx];
-    if (txfm_size >= TX_16X16) {
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx];
-        if (txfm_size >= TX_32X32) {
-          ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
-          ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
-          ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
-          ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
-          A2[aidx] = A2[aidx + 1] = A3[aidx] = A3[aidx + 1] = A0[aidx];
-          L2[lidx] = L2[lidx + 1] = L3[lidx] = L3[lidx + 1] = A0[aidx];
-        }
-      } else {
-        A0[aidx + 2] = A0[aidx + 3] = L0[lidx + 2] = L0[lidx + 3] = A0[aidx];
-        if (txfm_size >= TX_32X32) {
-          ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-          ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-          A1[aidx] = A1[aidx + 1] = A1[aidx + 2] = A1[aidx + 3] = A0[aidx];
-          L1[lidx] = L1[lidx + 1] = L1[lidx + 2] = L1[lidx + 3] = A0[aidx];
-        }
-      }
-    }
-  }
+
   return c;
 }
 
 static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
-  return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+  return vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-static INLINE int decode_sb(VP9D_COMP* const pbi,
-                            MACROBLOCKD* const xd,
-                            BOOL_DECODER* const bc,
-                            int offset, int count, int inc,
-                            int eob_max, TX_SIZE tx_size) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, eob_max);
-  int i, eobtotal = 0;
+struct decode_block_args {
+  VP9D_COMP *pbi;
+  MACROBLOCKD *xd;
+  vp9_reader *r;
+  int *eobtotal;
+};
+static void decode_block(int plane, int block,
+                         BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size,
+                         void *argv) {
+  const struct decode_block_args* const arg = argv;
+  const int bw = b_width_log2(bsize);
 
-  // luma blocks
-  for (i = 0; i < offset; i += inc) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob,
-                               xd->qcoeff + i * 16, tx_size);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
+  // find the maximum eob for this transform size, adjusted by segment
+  MACROBLOCKD *xd = arg->xd;
+  const int segment_id = arg->xd->mode_info_context->mbmi.segment_id;
+  const TX_SIZE ss_tx_size = ss_txfrm_size / 2;
+  const int seg_eob = get_eob(arg->xd, segment_id, 16 << ss_txfrm_size);
+  int16_t* const qcoeff_base = arg->xd->plane[plane].qcoeff;
+  const int off = block >> ss_txfrm_size;
+  const int mod = bw - ss_tx_size - arg->xd->plane[plane].subsampling_x;
+  const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size;
+  const int loff = (off >> mod) << ss_tx_size;
+  int pt;
+  ENTROPY_CONTEXT *A = arg->xd->plane[plane].above_context + aoff;
+  ENTROPY_CONTEXT *L = arg->xd->plane[plane].left_context + loff;
+  const int eob = decode_coefs(arg->pbi, arg->xd, arg->r, block,
+                               arg->xd->plane[plane].plane_type, seg_eob,
+                               BLOCK_OFFSET(qcoeff_base, block, 16),
+                               ss_tx_size, arg->xd->plane[plane].dequant,
+                               A,
+                               L);
 
-  // chroma blocks
-  for (i = offset; i < count; i += inc) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                               xd->qcoeff + i * 16, tx_size);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-
-  return eobtotal;
-}
-
-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32: {
-      // 32x32 luma block
-      const int segment_id = xd->mode_info_context->mbmi.segment_id;
-      int i, eobtotal = 0, seg_eob;
-      int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32);
-      xd->eobs[0] = c;
-      eobtotal += c;
-
-      // 16x16 chroma blocks
-      seg_eob = get_eob(xd, segment_id, 256);
-      for (i = 64; i < 96; i += 16) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                         xd->qcoeff + i * 16, TX_16X16);
-        xd->eobs[i] = c;
-        eobtotal += c;
-      }
-      return eobtotal;
-    }
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 64, 96, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 64, 96, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 64, 96, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      return decode_sb(pbi, xd, bc, 256, 384, 64, 32 * 32, TX_32X32);
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 256, 384, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 256, 384, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 256, 384, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
-                                      MACROBLOCKD* const xd,
-                                      BOOL_DECODER* const bc) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int i, eobtotal = 0, seg_eob;
-
-  // Luma block
-  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16);
-  xd->eobs[0] = c;
-  eobtotal += c;
-
-  // 8x8 chroma blocks
-  seg_eob = get_eob(xd, segment_id, 64);
-  for (i = 16; i < 24; i += 4) {
-    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                     seg_eob, xd->block[i].qcoeff, TX_8X8);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-  return eobtotal;
-}
-
-static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // luma blocks
-  int seg_eob = get_eob(xd, segment_id, 64);
-  for (i = 0; i < 16; i += 4) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                               seg_eob, xd->block[i].qcoeff, TX_8X8);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-
-  // chroma blocks
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-      xd->mode_info_context->mbmi.mode == SPLITMV) {
-    // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
-    seg_eob = get_eob(xd, segment_id, 16);
-    for (i = 16; i < 24; i++) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                                 seg_eob, xd->block[i].qcoeff, TX_4X4);
-      xd->eobs[i] = c;
-      eobtotal += c;
-    }
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    set_contexts_on_border(xd, bsize, plane, ss_tx_size, eob, aoff, loff, A, L);
   } else {
-    for (i = 16; i < 24; i += 4) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                                 seg_eob, xd->block[i].qcoeff, TX_8X8);
-      xd->eobs[i] = c;
-      eobtotal += c;
+    for (pt = 0; pt < (1 << ss_tx_size); pt++) {
+      A[pt] = L[pt] = eob > 0;
     }
   }
-
-  return eobtotal;
+  arg->xd->plane[plane].eobs[block] = eob;
+  arg->eobtotal[0] += eob;
 }
 
-static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc,
-                            PLANE_TYPE type, int i, int seg_eob) {
-  const int c = decode_coefs(dx, xd, bc, i, type, seg_eob,
-                             xd->block[i].qcoeff, TX_4X4);
-  xd->eobs[i] = c;
-  return c;
-}
-
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc,
-                         PLANE_TYPE type, int i) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob);
-}
-
-static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                   MACROBLOCKD* const xd,
-                                   BOOL_DECODER* const bc,
-                                   int seg_eob) {
-  int i, eobtotal = 0;
-
-  // chroma blocks
-  for (i = 16; i < 24; i++)
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                MACROBLOCKD* const xd,
-                                BOOL_DECODER* const bc) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  return decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
-}
-
-static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  // luma blocks
-  for (i = 0; i < 16; ++i)
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob);
-
-  // chroma blocks
-  eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens(VP9D_COMP* const dx,
+int vp9_decode_tokens(VP9D_COMP* const pbi,
                          MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  switch (tx_size) {
-    case TX_16X16:
-      return vp9_decode_mb_tokens_16x16(dx, xd, bc);
-    case TX_8X8:
-      return vp9_decode_mb_tokens_8x8(dx, xd, bc);
-    default:
-      assert(tx_size == TX_4X4);
-      return vp9_decode_mb_tokens_4x4(dx, xd, bc);
-  }
+                         vp9_reader *r,
+                         BLOCK_SIZE_TYPE bsize) {
+  int eobtotal = 0;
+  struct decode_block_args args = {pbi, xd, r, &eobtotal};
+  foreach_transformed_block(xd, bsize, decode_block, &args);
+  return eobtotal;
 }
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -14,22 +14,9 @@
 
 #include "vp9/decoder/vp9_onyxd_int.h"
 
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc,
-                         PLANE_TYPE type, int i);
-
-int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
-                         BOOL_DECODER* const);
-
-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc);
-
-int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           BOOL_DECODER* const bc);
-
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
-                                BOOL_DECODER* const bc);
+int vp9_decode_tokens(VP9D_COMP* const pbi,
+                      MACROBLOCKD* const xd,
+                      vp9_reader *r,
+                      BLOCK_SIZE_TYPE bsize);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -10,18 +10,15 @@
 
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_dequantize.h"
+#include "vp9/decoder/vp9_idct_blk.h"
 
-void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                const int16_t *dq,
-                                                uint8_t *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd) {
+void vp9_idct_add_y_block_c(int16_t *q, uint8_t *dst, int stride,
+                            MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      xd->itxm_add(q, dq, dst, dst, stride, stride, xd->eobs[i * 4 + j]);
+      vp9_idct_add(q, dst, stride, xd->plane[0].eobs[i * 4  + j]);
       q   += 16;
       dst += 4;
     }
@@ -30,202 +27,205 @@
   }
 }
 
-void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
-                                    uint8_t *pre,
-                                    uint8_t *dst,
-                                    int stride, MACROBLOCKD *xd) {
+void vp9_idct_add_uv_block_c(int16_t *q, uint8_t *dst, int stride,
+                             uint16_t *eobs) {
   int i, j;
 
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dst, 16, stride, xd->eobs[i * 4  + j]);
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      vp9_idct_add(q, dst, stride, eobs[i * 2 + j]);
       q   += 16;
-      pre += 4;
       dst += 4;
     }
 
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
+    dst += 4 * stride - 8;
   }
 }
 
-void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
-                                     uint8_t *pre, uint8_t *dstu,
-                                     uint8_t *dstv, int stride,
+void vp9_idct_add_y_block_8x8_c(int16_t *q, uint8_t *dst, int stride,
+                                MACROBLOCKD *xd) {
+  uint8_t *origdest = dst;
+
+  vp9_idct_add_8x8_c(q, dst, stride, xd->plane[0].eobs[0]);
+  vp9_idct_add_8x8_c(&q[64], origdest + 8, stride, xd->plane[0].eobs[4]);
+  vp9_idct_add_8x8_c(&q[128], origdest + 8 * stride, stride,
+                     xd->plane[0].eobs[8]);
+  vp9_idct_add_8x8_c(&q[192], origdest + 8 * stride + 8, stride,
+                     xd->plane[0].eobs[12]);
+}
+
+void vp9_idct_add_y_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
                                      MACROBLOCKD *xd) {
   int i, j;
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dstu, 8, stride,
-                           xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      vp9_idct_add_lossless_c(q, dst, stride, xd->plane[0].eobs[i * 4 + j]);
+      q   += 16;
+      dst += 4;
     }
 
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
+    dst += 4 * stride - 16;
   }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dstv, 8, stride,
-                           xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
 }
 
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd) {
+void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
+                                      uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      xd->itxm_add(q, dq, dstu, dstu, stride, stride, xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      dstu += 4;
+      vp9_idct_add_lossless_c(q, dst, stride, eobs[i * 2 + j]);
+      q   += 16;
+      dst += 4;
     }
 
-    dstu += 4 * stride - 8;
+    dst += 4 * stride - 8;
   }
+}
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      xd->itxm_add(q, dq, dstv, dstv, stride, stride, xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      dstv += 4;
-    }
+static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
+                                  int width, int height) {
+  int r, c;
 
-    dstv += 4 * stride - 8;
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++)
+      dest[c] = clip_pixel(diff + dest[c]);
+
+    dest += stride;
   }
 }
 
-void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                const int16_t *dq,
-                                                uint8_t *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, xd->eobs[0]);
+void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
+                                     int stride) {
+  add_constant_residual(diff, dest, stride, 8, 8);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
-                             dst + 8, stride, stride, xd->eobs[4]);
+void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
+                                       int stride) {
+  add_constant_residual(diff, dest, stride, 16, 16);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                             dst + 8 * stride, stride, stride,
-                             xd->eobs[8]);
+void vp9_add_constant_residual_32x32_c(const int16_t diff,  uint8_t *dest,
+                                       int stride) {
+  add_constant_residual(diff, dest, stride, 32, 32);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                             dst + 8 * stride + 8, stride, stride,
-                             xd->eobs[12]);
+void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
+                   int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add(input, dest, stride, eob);
+  } else {
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
+    vpx_memset(input, 0, 32);
+  }
 }
 
-void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
-                                        uint8_t *pre,
-                                        uint8_t *dst,
-                                        int stride, MACROBLOCKD *xd) {
-  uint8_t *origdest = dst;
-  uint8_t *origpred = pre;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, xd->eobs[0]);
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride, xd->eobs[4]);
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride,
-                             xd->eobs[8]);
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride,
-                             xd->eobs[12]);
+void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                       int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_8x8(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht8x8_add(input, dest, stride, tx_type);
+      vpx_memset(input, 0, 128);
+    }
+  }
 }
 
-void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq,
-                                         uint8_t *pre,
-                                         uint8_t *dstu,
-                                         uint8_t *dstv,
-                                         int stride, MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, xd->eobs[16]);
-
-  q    += 64;
-  pre  += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, xd->eobs[20]);
+void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1) {
+    vp9_short_idct4x4_add(input, dest, stride);
+    vpx_memset(input, 0, 32);
+  } else {
+    vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
+    ((int *)input)[0] = 0;
+  }
 }
 
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride,
-                             xd->eobs[16]);
-
-  q += 64;
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride,
-                             xd->eobs[20]);
+void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
+                             int eob) {
+  if (eob > 1) {
+    vp9_short_iwalsh4x4_add(input, dest, stride);
+    vpx_memset(input, 0, 32);
+  } else {
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
+    ((int *)input)[0] = 0;
+  }
 }
 
+void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  // If dc is 1, input[0] is already the reconstructed value, so no
+  // dequantization is needed. Also, when dc is 1 it is counted in eobs, i.e. eobs >= 1.
 
-void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                             uint8_t *pre,
-                                             uint8_t *dst,
-                                             int stride, MACROBLOCKD *xd) {
-  int i, j;
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob) {
+    if (eob == 1) {
+      // DC only DCT coefficient
+      int16_t in = input[0];
+      int16_t out;
 
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride,
-                                      xd->eobs[i * 4 + j]);
-      q   += 16;
-      pre += 4;
-      dst += 4;
+      // Note: the idct1 will need to be modified accordingly whenever
+      // vp9_short_idct8x8_c() is modified.
+      vp9_short_idct1_8x8_c(&in, &out);
+      input[0] = 0;
+
+      vp9_add_constant_residual_8x8(out, dest, stride);
+    } else {
+      vp9_short_idct8x8_add(input, dest, stride);
+      vpx_memset(input, 0, 128);
     }
+  }
+}
 
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
+void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                         int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_16x16(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
+      vpx_memset(input, 0, 512);
+    }
   }
 }
 
-void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
-                                              uint8_t *pre,
-                                              uint8_t *dstu,
-                                              uint8_t *dstv,
-                                              int stride,
-                                              MACROBLOCKD *xd) {
-  int i, j;
+void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to separate different cases. */
+  if (eob) {
+    if (eob == 1) {
+      /* DC only DCT coefficient. */
+      int16_t in = input[0];
+      int16_t out;
+      /* Note: the idct1 will need to be modified accordingly whenever
+       * vp9_short_idct16x16() is modified. */
+      vp9_short_idct1_16x16_c(&in, &out);
+      input[0] = 0;
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride,
-                                      xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
+      vp9_add_constant_residual_16x16(out, dest, stride);
+    } else {
+      vp9_short_idct16x16_add(input, dest, stride);
+      vpx_memset(input, 0, 512);
     }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
   }
+}
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride,
-                                      xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
+void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
 
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
+  if (eob) {
+    if (eob == 1) {
+      vp9_short_idct1_32x32(input, output);
+      vp9_add_constant_residual_32x32(output[0], dest, stride);
+      input[0] = 0;
+    } else {
+      vp9_short_idct32x32_add(input, dest, stride);
+      vpx_memset(input, 0, 2048);
+    }
   }
 }
 
--- /dev/null
+++ b/vp9/decoder/vp9_idct_blk.h
@@ -1,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_IDCT_BLK_H_
+#define VP9_DECODER_VP9_IDCT_BLK_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+
+void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride,
+                             int eob);
+
+void vp9_idct_add_y_block_lossless_c(int16_t *q, unsigned char *dst, int stride,
+                                     struct macroblockd *xd);
+
+void vp9_idct_add_uv_block_lossless_c(int16_t *q, unsigned char *dst,
+                                      int stride, uint16_t *eobs);
+
+void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                   int stride, int eob);
+
+void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                       int stride, int eob);
+
+void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                         int stride, int eob);
+
+#endif  // VP9_DECODER_VP9_IDCT_BLK_H_
--- a/vp9/decoder/vp9_onyxd.h
+++ b/vp9/decoder/vp9_onyxd.h
@@ -11,54 +11,56 @@
 #ifndef VP9_COMMON_VP9_ONYXD_H_
 #define VP9_COMMON_VP9_ONYXD_H_
 
-/* Create/destroy static data structures. */
 #ifdef __cplusplus
 extern "C" {
 #endif
+
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
-#include "vpx_ports/mem.h"
 #include "vpx/vpx_codec.h"
 
-  typedef void   *VP9D_PTR;
-  typedef struct {
-    int     Width;
-    int     Height;
-    int     Version;
-    int     postprocess;
-    int     max_threads;
-    int     inv_tile_order;
-    int     input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
+typedef void *VP9D_PTR;
 
-  void vp9_initialize_dec(void);
+typedef struct {
+  int width;
+  int height;
+  int version;
+  int postprocess;
+  int max_threads;
+  int inv_tile_order;
+  int input_partition;
+} VP9D_CONFIG;
 
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char **dest,
-                                  int64_t time_stamp);
+typedef enum {
+  VP9_LAST_FLAG = 1,
+  VP9_GOLD_FLAG = 2,
+  VP9_ALT_FLAG = 4
+} VP9_REFFRAME;
 
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
+void vp9_initialize_dec();
 
-  vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
-                                         VP9_REFFRAME ref_frame_flag,
-                                         YV12_BUFFER_CONFIG *sd);
+int vp9_receive_compressed_data(VP9D_PTR comp,
+                                uint64_t size, const uint8_t **dest,
+                                int64_t time_stamp);
 
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
+int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                      int64_t *time_stamp, int64_t *time_end_stamp,
+                      vp9_ppflags_t *flags);
 
-  int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd);
 
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd);
 
-  void vp9_remove_decompressor(VP9D_PTR comp);
+int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+
+
+VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+void vp9_remove_decompressor(VP9D_PTR comp);
 
 #ifdef __cplusplus
 }
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -21,8 +21,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
-
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -36,7 +34,7 @@
 static void recon_write_yuv_frame(const char *name,
                                   const YV12_BUFFER_CONFIG *s,
                                   int w, int _h) {
-  FILE *yuv_file = fopen((char *)name, "ab");
+  FILE *yuv_file = fopen(name, "ab");
   const uint8_t *src = s->y_buffer;
   int h = _h;
 
@@ -111,7 +109,7 @@
 }
 
 VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
-  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
+  VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
 
   if (!pbi)
     return NULL;
@@ -121,7 +119,7 @@
   if (setjmp(pbi->common.error.jmp)) {
     pbi->common.error.setjmp = 0;
     vp9_remove_decompressor(pbi);
-    return 0;
+    return NULL;
   }
 
   pbi->common.error.setjmp = 1;
@@ -128,33 +126,30 @@
   vp9_initialize_dec();
 
   vp9_create_common(&pbi->common);
-  pbi->oxcf = *oxcf;
 
+  pbi->oxcf = *oxcf;
   pbi->common.current_video_frame = 0;
   pbi->ready_for_new_data = 1;
 
-  /* vp9_init_de_quantizer() is first called here. Add check in
-   * frame_init_dequantizer() to avoid unnecessary calling of
-   * vp9_init_de_quantizer() for every frame.
-   */
-  vp9_init_de_quantizer(pbi);
+  // vp9_init_dequantizer() is first called here. Add check in
+  // frame_init_dequantizer() to avoid unnecessary calling of
+  // vp9_init_dequantizer() for every frame.
+  vp9_init_dequantizer(&pbi->common);
 
   vp9_loop_filter_init(&pbi->common);
 
   pbi->common.error.setjmp = 0;
-
   pbi->decoded_key_frame = 0;
 
-  return (VP9D_PTR) pbi;
+  return pbi;
 }
 
 void vp9_remove_decompressor(VP9D_PTR ptr) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
 
   if (!pbi)
     return;
 
-  // Delete segmentation map
   if (pbi->common.last_frame_seg_map)
     vpx_free(pbi->common.last_frame_seg_map);
 
@@ -252,7 +247,7 @@
   return 0;
 }
 
-/* If any buffer updating is signalled it should be done here. */
+/* If any buffer updating is signaled it should be done here. */
 static void swap_frame_buffers(VP9D_COMP *pbi) {
   int ref_index = 0, mask;
 
@@ -273,24 +268,23 @@
     pbi->common.active_ref_idx[ref_index] = INT_MAX;
 }
 
-int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
-                                const unsigned char **psource,
+int vp9_receive_compressed_data(VP9D_PTR ptr,
+                                uint64_t size, const uint8_t **psource,
                                 int64_t time_stamp) {
   VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
-  const unsigned char *source = *psource;
+  const uint8_t *source = *psource;
   int retcode = 0;
 
   /*if(pbi->ready_for_new_data == 0)
       return -1;*/
 
-  if (ptr == 0) {
+  if (ptr == 0)
     return -1;
-  }
 
   pbi->common.error.error_code = VPX_CODEC_OK;
 
-  pbi->Source = source;
+  pbi->source = source;
   pbi->source_sz = size;
 
   if (pbi->source_sz == 0) {
@@ -325,6 +319,7 @@
 
     if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
       cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
     return -1;
   }
 
@@ -354,10 +349,20 @@
 
     if (cm->filter_level) {
       /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0,
-                            cm->dering_enabled);
+      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
     }
-    vp8_yv12_extend_frame_borders(cm->frame_to_show);
+
+#if WRITE_RECON_BUFFER == 2
+    if (cm->show_frame)
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 2000);
+    else
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 3000);
+#endif
+
+    vp9_extend_frame_borders(cm->frame_to_show,
+                             cm->subsampling_x, cm->subsampling_y);
   }
 
 #if WRITE_RECON_BUFFER == 1
@@ -368,19 +373,19 @@
 
   vp9_clear_system_state();
 
+  cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  }
+    // current mip will be the prev_mip for the next frame
+    MODE_INFO *temp = cm->prev_mip;
+    cm->prev_mip = cm->mip;
+    cm->mip = temp;
 
-  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
-                                       cm->current_video_frame);*/
+    // update the upper left visible macroblock ptrs
+    cm->mi = cm->mip + cm->mode_info_stride + 1;
+    cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 
-  if (cm->show_frame)
     cm->current_video_frame++;
+  }
 
   pbi->ready_for_new_data = 0;
   pbi->last_time_stamp = time_stamp;
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -14,7 +14,7 @@
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/decoder/vp9_dequantize.h"
+#include "vp9/decoder/vp9_idct_blk.h"
 
 // #define DEC_DEBUG
 
@@ -25,13 +25,12 @@
 
   VP9D_CONFIG oxcf;
 
+  const uint8_t *source;
+  uint32_t source_sz;
 
-  const unsigned char *Source;
-  unsigned int   source_sz;
-
   vp9_reader *mbc;
   int64_t last_time_stamp;
-  int   ready_for_new_data;
+  int ready_for_new_data;
 
   int refresh_frame_flags;
   vp9_prob prob_skip_false;
@@ -41,8 +40,6 @@
   int initial_width;
   int initial_height;
 } VP9D_COMP;
-
-int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
 
 
 #if CONFIG_DEBUG
--- /dev/null
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -1,0 +1,54 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_READ_BIT_BUFFER_
+#define VP9_READ_BIT_BUFFER_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+typedef void (*vp9_rb_error_handler)(void *data, int bit_offset);
+
+struct vp9_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vp9_rb_error_handler error_handler;
+};
+
+static size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+}
+
+static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+  const int off = rb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
+    rb->error_handler(rb->error_handler_data, rb->bit_offset);
+    return 0;
+  } else {
+    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+    rb->bit_offset = off + 1;
+    return bit;
+  }
+}
+
+static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= vp9_rb_read_bit(rb) << bit;
+  return value;
+}
+
+#endif  // VP9_READ_BIT_BUFFER_
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h
@@ -15,12 +15,8 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-typedef BOOL_DECODER vp9_reader;
-
-#define vp9_read decode_bool
-#define vp9_read_literal decode_value
-#define vp9_read_bit(r) vp9_read(r, vp9_prob_half)
 #define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8))
+#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value))
 
 // Intent of tree data structure is to make decoding trivial.
 static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,249 +15,20 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,
-                               int pitch, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
-void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred,
-                               int pitch, uint8_t *dest, int stride) {
-  const int width = 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
-  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
-  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
-  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-  p4 = _mm_unpacklo_epi8(p4, zero);
-  p5 = _mm_unpacklo_epi8(p5, zero);
-  p6 = _mm_unpacklo_epi8(p6, zero);
-  p7 = _mm_unpacklo_epi8(p7, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-  p4 = _mm_add_epi16(p4, d4);
-  p5 = _mm_add_epi16(p5, d5);
-  p6 = _mm_add_epi16(p6, d6);
-  p7 = _mm_add_epi16(p7, d7);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-  p4 = _mm_packus_epi16(p4, p5);
-  p6 = _mm_packus_epi16(p6, p7);
-
-  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
-  p0 = _mm_srli_si128(p0, 8);
-  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
-  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
-  p2 = _mm_srli_si128(p2, 8);
-  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
-  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
-  p4 = _mm_srli_si128(p4, 8);
-  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
-  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
-  p6 = _mm_srli_si128(p6, 8);
-  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
-void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred,
-                             int pitch, uint8_t *dest, int stride) {
-  const int width = 16;
-  int i = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-    p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
-    diff += 4 * width;
-    pred += 4 * pitch;
-    dest += 4 * stride;
-  } while (--i);
-}
-
-void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred,
-                             int pitch, uint8_t *dest, int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
-    p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    pred += 2 * pitch;
-    dest += 2 * stride;
-  } while (--i);
-}
-
-void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred,
-                                        int pitch, uint8_t *dest, int stride) {
+void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
+                                        int stride) {
   uint8_t abs_diff;
   __m128i d;
 
   // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
 
   p0 = _mm_unpacklo_epi64(p0, p1);
   p2 = _mm_unpacklo_epi64(p2, p3);
@@ -301,29 +72,28 @@
   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
 }
 
-void vp9_add_constant_residual_16x16_sse2(const int16_t diff,
-                                          const uint8_t *pred, int pitch,
-                                          uint8_t *dest, int stride) {
+void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
+                                          int stride) {
   uint8_t abs_diff;
   __m128i d;
 
   // Prediction data.
-  __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch));
-  __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch));
-  __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch));
-  __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch));
-  __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch));
-  __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch));
-  __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch));
-  __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch));
-  __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch));
+  __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
+  __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
+  __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
+  __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
+  __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
+  __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
+  __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
+  __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
+  __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
+  __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
+  __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
+  __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
+  __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
+  __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
+  __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
+  __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));
 
   // Clip diff value to [0, 255] range. Then, do addition or subtraction
   // according to its sign.
@@ -388,9 +158,8 @@
   _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
 }
 
-void vp9_add_constant_residual_32x32_sse2(const int16_t diff,
-                                          const uint8_t *pred, int pitch,
-                                          uint8_t *dest, int stride) {
+void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
+                                          int stride) {
   uint8_t abs_diff;
   __m128i d;
   int i = 8;
@@ -405,14 +174,14 @@
 
   do {
     // Prediction data.
-    __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
-    __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
-    __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-    __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16));
-    __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-    __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16));
+    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
+    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
+    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
+    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
+    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
+    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
+    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
+    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));
 
     // Clip diff value to [0, 255] range. Then, do addition or subtraction
     // according to its sign.
@@ -446,7 +215,6 @@
     _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
     _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
 
-    pred += 4 * pitch;
     dest += 4 * stride;
   } while (--i);
 }
--- a/vp9/decoder/x86/vp9_idct_blk_sse2.c
+++ /dev/null
@@ -1,117 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_dequantize.h"
-
-void vp9_idct_dequant_dc_0_2x_sse2(short *q, const short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, const short *dc);
-
-void vp9_idct_dequant_dc_full_2x_sse2(short *q, const short *dq,
-                                      unsigned char *pre, unsigned char *dst,
-                                      int dst_stride, const short *dc);
-
-void vp9_idct_dequant_0_2x_sse2(short *q, const short *dq,
-                                unsigned char *pre, unsigned char *dst,
-                                int dst_stride, int blk_stride);
-
-void vp9_idct_dequant_full_2x_sse2(short *q, const short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, int blk_stride);
-
-void vp9_dequant_dc_idct_add_y_block_sse2(short *q, const short *dq,
-                                          unsigned char *pre,
-                                          unsigned char *dst,
-                                          int stride, unsigned short *eobs,
-                                          const short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                       stride, dc + 2);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                    stride, dc + 2);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_sse2(short *q, const short *dq,
-                                       unsigned char *pre, unsigned char *dst,
-                                       int stride, unsigned short *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-
-    q    += 64;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_sse2(short *q, const short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv,
-                                        int stride, unsigned short *eobs) {
-  if (((short *)(eobs))[0] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstu += stride * 4;
-
-  if (((short *)(eobs))[1] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-
-  if (((short *)(eobs))[2] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstv += stride * 4;
-
-  if (((short *)(eobs))[3] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-}
--- a/vp9/encoder/ppc/vp9_csystemdependent.c
+++ /dev/null
@@ -1,155 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-
-SADFunction *vp9_sad16x16;
-SADFunction *vp9_sad16x8;
-SADFunction *vp9_sad8x16;
-SADFunction *vp9_sad8x8;
-SADFunction *vp9_sad4x4;
-
-variance_function *vp9_variance4x4;
-variance_function *vp9_variance8x8;
-variance_function *vp9_variance8x16;
-variance_function *vp9_variance16x8;
-variance_function *vp9_variance16x16;
-
-variance_function *vp9_mse16x16;
-
-sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
-
-int (*vp9_block_error)(short *coeff, short *dqcoeff);
-int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp9_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp9_get_mb_ss)(short *);
-void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp9_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp9_get_mb_ss_c(short *);
-
-// ppc
-extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp9_sad16x16_ppc;
-extern SADFunction vp9_sad16x8_ppc;
-extern SADFunction vp9_sad8x16_ppc;
-extern SADFunction vp9_sad8x8_ppc;
-extern SADFunction vp9_sad4x4_ppc;
-
-extern variance_function vp9_variance16x16_ppc;
-extern variance_function vp9_variance8x16_ppc;
-extern variance_function vp9_variance16x8_ppc;
-extern variance_function vp9_variance8x8_ppc;
-extern variance_function vp9_variance4x4_ppc;
-extern variance_function vp9_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-void vp9_cmachine_specific_config(void) {
-  // Pure C:
-  vp9_mbuverror               = vp9_mbuverror_c;
-  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
-  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
-  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
-  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
-  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
-  short_walsh4x4               = vp9_short_walsh4x4_c;
-
-  vp9_variance4x4             = vp9_variance4x4_ppc;
-  vp9_variance8x8             = vp9_variance8x8_ppc;
-  vp9_variance8x16            = vp9_variance8x16_ppc;
-  vp9_variance16x8            = vp9_variance16x8_ppc;
-  vp9_variance16x16           = vp9_variance16x16_ppc;
-  vp9_mse16x16                = vp9_mse16x16_ppc;
-
-  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
-  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
-  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
-  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
-  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
-
-  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
-
-  vp9_sad16x16                = vp9_sad16x16_ppc;
-  vp9_sad16x8                 = vp9_sad16x8_ppc;
-  vp9_sad8x16                 = vp9_sad8x16_ppc;
-  vp9_sad8x8                  = vp9_sad8x8_ppc;
-  vp9_sad4x4                  = vp9_sad4x4_ppc;
-
-  vp9_block_error              = vp9_block_error_ppc;
-  vp9_mbblock_error            = vp9_mbblock_error_c;
-
-  vp9_subtract_b               = vp9_subtract_b_c;
-  vp9_subtract_mby             = vp9_subtract_mby_ppc;
-  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
-}
--- a/vp9/encoder/ppc/vp9_encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_subtract_mbuv_ppc
-    .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r9, 256
-    add     r3, r3, r9
-    add     r3, r3, r9
-    add     r6, r6, r9
-
-    li      r10, 16
-    li      r9,  4
-    mtctr   r9
-
-    vspltisw v0, 0
-
-mbu_loop:
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r4, r4, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-
-    add     r4, r4, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbu_loop
-
-    mtctr   r9
-
-mbv_loop:
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r5, r5, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-
-    add     r5, r5, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbv_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-    vspltisw v0, 0
-
-mby_loop:
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r5           ;# pred
-
-    add     r4, r4, r6
-    addi    r5, r5, 16
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vmrglb  v3, v0, v1          ;# unpack low src  to short
-    vmrglb  v4, v0, v2          ;# unpack low pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mby_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/encoder/ppc/vp9_fdct_altivec.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_short_fdct4x4_ppc
-    .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;#   in normalization (fwd is twice unitary, inv is half unitary)
-;#   and that they are of course transposes of each other.
-;#
-;#   The following three accomplish most of implementation and
-;#   are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    li      r6, 16
-
-    load_c v0, dct_tab, 0, r9, r10
-    lvx     v1,   r6, r10
-    addi    r10, r10, 32
-    lvx     v2,    0, r10
-    lvx     v3,   r6, r10
-
-    load_c v4, ppc_dctperm_tab,  0, r9, r10
-    load_c v5, ppc_dctperm_tab, r6, r9, r10
-
-    load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
-;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
-;#   For fwd transform, indices are horizontal positions, then frequencies.
-;#   For inverse transform, frequencies then positions.
-;#   The two resulting  A0..A3  B0..B3  are later combined
-;#   and vertically transformed.
-
-.macro two_rows_horiz Dst
-    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
-
-    vmsumshm v10, v0, v8, v6
-    vmsumshm v10, v1, v9, v10
-    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
-
-    vmsumshm v11, v2, v8, v6
-    vmsumshm v11, v3, v9, v11
-    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
-
-    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
-    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;#   forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
-    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
-    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v10, v8, v7
-
-    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
-    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v8, v8, v7
-
-    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
-.endm
-
-.macro two_rows_h Dest
-    stw     r0,  0(r8)
-    lwz     r0,  4(r3)
-    stw     r0,  4(r8)
-    lwzux   r0, r3,r5
-    stw     r0,  8(r8)
-    lwz     r0,  4(r3)
-    stw     r0, 12(r8)
-    lvx     v8,  0,r8
-    two_rows_horiz \Dest
-.endm
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8,  r1, 0
-    addi    r10, r3, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    ;# Next block
-    addi    r3, r10, 8
-    addi    r4, r4, 32
-    lvx     v6, 0, r9           ;# v6 = Hround
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .data
-    .align 4
-ppc_dctperm_tab:
-    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
-    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
-    .align 4
-dct_tab:
-    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
-    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
-    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
-    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
-    .align 4
-round_tab:
-    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
-    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
--- a/vp9/encoder/ppc/vp9_rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_block_error_ppc
-
-    .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    stw     r5, 12(r1)          ;# tranfer dc to vector register
-
-    lvx     v0, 0, r3           ;# Coeff
-    lvx     v1, 0, r4           ;# dqcoeff
-
-    li      r10, 16
-
-    vspltisw v3, 0
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v2, v0, v0, v3     ;# multiply differences
-
-    lvx     v0, r10, r3         ;# Coeff
-    lvx     v1, r10, r4         ;# dqcoeff
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v1, v0, v0, v2     ;# multiply differences
-    vsumsws v1, v1, v3          ;# sum up
-
-    stvx    v1, 0, r1
-    lwz     r3, 12(r1)          ;# return value
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/encoder/ppc/vp9_sad_altivec.asm
+++ /dev/null
@@ -1,277 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sad16x16_ppc
-    .globl vp8_sad16x8_ppc
-    .globl vp8_sad8x16_ppc
-    .globl vp8_sad8x8_ppc
-    .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v8, 0              ;# zero out total to start
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
-    lvsl    v3,  0, r5          ;# only needs to be done once per block
-
-    ;# preload a line of data before getting into the loop
-    lvx     v4, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    add     r5, r5, r6
-    add     r3, r3, r4
-
-    vperm   v5, v1, v2, v3
-
-    .align 4
-\loop_label:
-    ;# compute difference on first row
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-
-    ;# load up next set of data
-    lvx     v9, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    ;# perform abs() of difference
-    vor     v6, v6, v7
-    add     r3, r3, r4
-
-    ;# add to the running tally
-    vsum4ubs v8, v6, v8
-
-    ;# now onto the next line
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-    lvx     v4, 0, r3
-
-    ;# compute difference on second row
-    vsububs v6, v9, v5
-    lvx     v1,  0, r5
-    vsububs v7, v5, v9
-    lvx     v2, r10, r5
-    vor     v6, v6, v7
-    add     r3, r3, r4
-    vsum4ubs v8, v6, v8
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
-    .align 4
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v7, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v7
-
-    SAD_16
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_16_loop sad16x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_16_loop sad16x8_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_8_loop sad8x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_8_loop sad8x8_loop
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r7, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r7,  4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    vspltisw v8, 0              ;# zero out total to start
-
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v7, v6, v8
-    vsumsws v7, v7, v8
-
-    stvx    v7, 0, r1
-    lwz     r3, 12(r1)
-
-    epilogue
-
-    blr
--- a/vp9/encoder/ppc/vp9_variance_altivec.asm
+++ /dev/null
@@ -1,375 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_get8x8var_ppc
-    .globl vp8_get16x16var_ppc
-    .globl vp8_mse16x16_ppc
-    .globl vp9_variance16x16_ppc
-    .globl vp9_variance16x8_ppc
-    .globl vp9_variance8x16_ppc
-    .globl vp9_variance8x8_ppc
-    .globl vp9_variance4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v7, 0              ;# zero for merging
-    vspltisw v8, 0              ;# zero out total to start
-    vspltisw v9, 0              ;# zero out total for dif^2
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first.  Unpack to so signed subract
-    ;#  can be used.  Only have a half word signed
-    ;#  subract.  Do high, then low.
-    vmrghb  v2, v7, v4
-    vmrghb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    vmrglb  v2, v7, v4
-    vmrglb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v0, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v0
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, get8x8var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, get16x16var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r 3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr   r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz    mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stw     r3, 0(r7)           ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r10,0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r10, 4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, 4           ;# (sum*sum) >> 4
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
--- a/vp9/encoder/ppc/vp9_variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp9_sub_pixel_variance4x4_ppc
-    .globl vp9_sub_pixel_variance8x8_ppc
-    .globl vp9_sub_pixel_variance8x16_ppc
-    .globl vp9_sub_pixel_variance16x8_ppc
-    .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li      r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    vperm   v24, v21, v21, \hp  ;# v20 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, \lp  ;# v21 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
-    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first.  Unpack to so signed subract
-    ;#  can be used.  Only have a half word signed
-    ;#  subract.  Do high, then low.
-    vmrghb  \t1, \z0, \src
-    vmrghb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb  \t1, \z0, \src
-    vmrglb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor     \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx    \sum, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    \sse, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r9)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16  v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl    v17,  0, \R         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, \R
-    lvx     v22, r10, \R
-
-.if \increment_counter
-    add     \R, \R, \P
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm   v0, v0, v1, v10
-    vperm   v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq     compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_and_align_16 v4,  r7, r8, 1
-    load_and_align_16 v5,  r7, r8, 1
-    load_and_align_16 v6,  r7, r8, 1
-    load_and_align_16 v7,  r7, r8, 1
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb  v4, v4,  v5
-    vmrghb  v5, v6,  v7
-    vmrghb  v6, v8,  v9
-    vmrghb  v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0,  v29, v30, 1
-    hfilter_8 v1,  v29, v30, 1
-    hfilter_8 v2,  v29, v30, 1
-    hfilter_8 v3,  v29, v30, 1
-    hfilter_8 v4,  v29, v30, 1
-    hfilter_8 v5,  v29, v30, 1
-    hfilter_8 v6,  v29, v30, 1
-    hfilter_8 v7,  v29, v30, 1
-    hfilter_8 v8,  v29, v30, 1
-    hfilter_8 v9,  v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b   second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0,  r3, r4, 1
-    load_and_align_16 v1,  r3, r4, 1
-    load_and_align_16 v2,  r3, r4, 1
-    load_and_align_16 v3,  r3, r4, 1
-    load_and_align_16 v4,  r3, r4, 1
-    load_and_align_16 v5,  r3, r4, 1
-    load_and_align_16 v6,  r3, r4, 1
-    load_and_align_16 v7,  r3, r4, 1
-    load_and_align_16 v8,  r3, r4, 1
-    load_and_align_16 v9,  r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq     compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0,  v1
-    vmrghb  v1, v2,  v3
-    vmrghb  v2, v4,  v5
-    vmrghb  v3, v6,  v7
-    vmrghb  v4, v8,  v9
-    vmrghb  v5, v10, v11
-    vmrghb  v6, v12, v13
-    vmrghb  v7, v14, v15
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b   second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-
-    beq     compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-    load_and_align_16  v9,  r3, r4, 1
-    load_and_align_16  v10, r3, r4, 1
-    load_and_align_16  v11, r3, r4, 1
-    load_and_align_16  v12, r3, r4, 1
-    load_and_align_16  v13, r3, r4, 1
-    load_and_align_16  v14, r3, r4, 1
-    load_and_align_16  v15, r3, r4, 1
-    load_and_align_16  v16, r3, r4, 0
-
-    beq     compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0,  1
-    compute_sum_sse_16 v1,  1
-    compute_sum_sse_16 v2,  1
-    compute_sum_sse_16 v3,  1
-    compute_sum_sse_16 v4,  1
-    compute_sum_sse_16 v5,  1
-    compute_sum_sse_16 v6,  1
-    compute_sum_sse_16 v7,  1
-    compute_sum_sse_16 v8,  1
-    compute_sum_sse_16 v9,  1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -10,31 +10,8 @@
 
 
 #include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "vp9/encoder/vp9_block.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_tokenize.h"
 
 BEGIN
 
-/* regular quantize */
-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
 
-DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-
 END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
- */
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -8,354 +8,300 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/common/vp9_header.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_entropymv.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include <assert.h>
 #include <stdio.h>
 #include <limits.h>
-#include "vp9/common/vp9_pragmas.h"
+
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_segmentation.h"
 
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES];
-vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
+int intra_mode_stats[VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES];
+vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
 extern unsigned int active_section;
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC4X4_TOKENS];
-unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC8X8_TOKENS];
-unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_TOKENS];
-unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_TOKENS];
-unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                          [NZC_BITS_EXTRA][2];
-void init_nzcstats();
-void update_nzcstats(VP9_COMMON *const cm);
-void print_nzcstats();
-#endif
-#endif
-
-#ifdef MODE_STATS
-int count_mb_seg[4] = { 0, 0, 0, 0 };
-#endif
-
 #define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
-#define SEARCH_NEWP
-static int update_bits[255];
+#ifdef MODE_STATS
+int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
+int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1]
+                               [VP9_SWITCHABLE_FILTERS];
 
-static void compute_update_table() {
-  int i;
-  for (i = 0; i < 255; i++)
-    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+void init_tx_count_stats() {
+  vp9_zero(tx_count_32x32p_stats);
+  vp9_zero(tx_count_16x16p_stats);
+  vp9_zero(tx_count_8x8p_stats);
 }
 
-static int split_index(int i, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (i % modulus == modulus / 2) i = i / modulus;
-  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
-  return i;
+void init_switchable_interp_stats() {
+  vp9_zero(switchable_interp_stats);
 }
 
-static int remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  if ((m << 1) <= n)
-    i = vp9_recenter_nonneg(v, m) - 1;
-  else
-    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
-  i = split_index(i, n - 1, modulus);
-  return i;
+static void update_tx_count_stats(VP9_COMMON *cm) {
+  int i, j;
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB; j++) {
+      tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j];
+    }
+  }
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+      tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j];
+    }
+  }
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+      tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j];
+    }
+  }
 }
 
-static void write_prob_diff_update(vp9_writer *const bc,
-                                   vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+static void update_switchable_interp_stats(VP9_COMMON *cm) {
+  int i, j;
+  for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; ++i)
+    for (j = 0; j < VP9_SWITCHABLE_FILTERS; ++j) {
+      switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
+    }
 }
 
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
+void write_tx_count_stats() {
+  int i, j;
+  FILE *fp = fopen("tx_count.bin", "wb");
+  fwrite(tx_count_32x32p_stats, sizeof(tx_count_32x32p_stats), 1, fp);
+  fwrite(tx_count_16x16p_stats, sizeof(tx_count_16x16p_stats), 1, fp);
+  fwrite(tx_count_8x8p_stats, sizeof(tx_count_8x8p_stats), 1, fp);
+  fclose(fp);
+
+  printf(
+      "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB; j++) {
+      printf("%"PRId64", ", tx_count_32x32p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+  printf(
+      "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-1] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+      printf("%"PRId64", ", tx_count_16x16p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+  printf(
+      "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-2] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+      printf("%"PRId64", ", tx_count_8x8p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
 }
 
-static void update_mode(
-  vp9_writer *const bc,
-  int n,
-  vp9_token tok               [/* n */],
-  vp9_tree tree,
-  vp9_prob Pnew               [/* n-1 */],
-  vp9_prob Pcur               [/* n-1 */],
-  unsigned int bct            [/* n-1 */] [2],
-  const unsigned int num_events[/* n */]
-) {
-  unsigned int new_b = 0, old_b = 0;
-  int i = 0;
+void write_switchable_interp_stats() {
+  int i, j;
+  FILE *fp = fopen("switchable_interp.bin", "wb");
+  fwrite(switchable_interp_stats, sizeof(switchable_interp_stats), 1, fp);
+  fclose(fp);
 
-  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
-  n--;
+  printf(
+      "vp9_default_switchable_filter_count[VP9_SWITCHABLE_FILTERS+1]"
+      "[VP9_SWITCHABLE_FILTERS] = {\n");
+  for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; i++) {
+    printf("  { ");
+    for (j = 0; j < VP9_SWITCHABLE_FILTERS; j++) {
+      printf("%"PRId64", ", switchable_interp_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+}
+#endif
 
-  do {
-    new_b += cost_branch(bct[i], Pnew[i]);
-    old_b += cost_branch(bct[i], Pcur[i]);
-  } while (++i < n);
+static int update_bits[255];
 
-  if (new_b + (n << 8) < old_b) {
-    int i = 0;
+static INLINE void write_be32(uint8_t *p, int value) {
+  p[0] = value >> 24;
+  p[1] = value >> 16;
+  p[2] = value >> 8;
+  p[3] = value;
+}
 
-    vp9_write_bit(bc, 1);
 
-    do {
-      const vp9_prob p = Pnew[i];
 
-      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
-    } while (++i < n);
-  } else
-    vp9_write_bit(bc, 0);
+int recenter_nonneg(int v, int m) {
+  if (v > (m << 1))
+    return v;
+  else if (v >= m)
+    return ((v - m) << 1);
+  else
+    return ((m - v) << 1) - 1;
 }
 
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
-                                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  {
-    vp9_prob Pnew   [VP9_YMODES - 1];
-    unsigned int bct [VP9_YMODES - 1] [2];
-
-    update_mode(
-      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
-    );
-    update_mode(bc, VP9_I32X32_MODES, vp9_sb_ymode_encodings,
-                vp9_sb_ymode_tree, Pnew, cm->fc.sb_ymode_prob, bct,
-                (unsigned int *)cpi->sb_ymode_count);
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
   }
+  return cat;
 }
 
-void vp9_update_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *const pc = &cpi->common;
-  int k;
-
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
-                                               cpi->skip_true_count[k]);
-  }
+void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+                             int data, int max) {
+  vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void update_switchable_interp_probs(VP9_COMP *cpi,
-                                           vp9_writer* const bc) {
-  VP9_COMMON *const pc = &cpi->common;
-  unsigned int branch_ct[32][2];
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    vp9_tree_probs_from_distribution(
-        vp9_switchable_interp_tree,
-        pc->fc.switchable_interp_prob[j], branch_ct,
-        cpi->switchable_interp_count[j], 0);
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      if (pc->fc.switchable_interp_prob[j][i] < 1)
-        pc->fc.switchable_interp_prob[j][i] = 1;
-      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
-    }
+void encode_uniform(vp9_writer *w, int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0)
+    return;
+  m = (1 << l) - n;
+  if (v < m) {
+    vp9_write_literal(w, v, l - 1);
+  } else {
+    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp9_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-// This function updates the reference frame prediction stats
-static void update_refpred_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i;
-  vp9_prob new_pred_probs[PREDICTION_PROBS];
-  int old_cost, new_cost;
-
-  // Set the prediction probability structures to defaults
-  if (cm->frame_type != KEY_FRAME) {
-    // From the prediction counts set the probabilities for each context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
-                                          cpi->ref_pred_count[i][1]);
-
-      // Decide whether or not to update the reference frame probs.
-      // Returned costs are in 1/256 bit units.
-      old_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
-
-      new_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
-
-      // Cost saving must be >= 8 bits (2048 in these units)
-      if ((old_cost - new_cost) >= 2048) {
-        cpi->ref_pred_probs_update[i] = 1;
-        cm->ref_pred_probs[i] = new_pred_probs[i];
-      } else
-        cpi->ref_pred_probs_update[i] = 0;
-    }
-  }
+int count_uniform(int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return 0;
+  m = (1 << l) - n;
+  if (v < m)
+    return l - 1;
+  else
+    return l;
 }
 
-// This function is called to update the mode probability context used to encode
-// inter modes. It assumes the branch counts table has already been populated
-// prior to the actual packing of the bitstream (in rd stage or dummy pack)
-//
-// The branch counts table is re-populated during the actual pack stage and in
-// the decoder to facilitate backwards update of the context.
-static void update_inter_mode_probs(VP9_COMMON *cm,
-                                    int mode_context[INTER_MODE_CONTEXTS][4]) {
-  int i, j;
-  unsigned int (*mv_ref_ct)[4][2];
-
-  vpx_memcpy(mode_context, cm->fc.vp9_mode_contexts,
-             sizeof(cm->fc.vp9_mode_contexts));
-
-  mv_ref_ct = cm->fc.mv_ref_ct;
-
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-    for (j = 0; j < 4; j++) {
-      int new_prob, old_cost, new_cost;
-
-      // Work out cost of coding branches with the old and optimal probability
-      old_cost = cost_branch256(mv_ref_ct[i][j], mode_context[i][j]);
-      new_prob = get_binary_prob(mv_ref_ct[i][j][0], mv_ref_ct[i][j][1]);
-      new_cost = cost_branch256(mv_ref_ct[i][j], new_prob);
-
-      // If cost saving is >= 14 bits then update the mode probability.
-      // This is the approximate net cost of updating one probability given
-      // that the no update case ismuch more common than the update case.
-      if (new_cost <= (old_cost - (14 << 8))) {
-        mode_context[i][j] = new_prob;
+void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      encode_uniform(w, word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      vp9_write_literal(w, t, 1);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        vp9_write_literal(w, word - mk, b);
+        break;
       }
     }
   }
 }
 
-#if CONFIG_NEW_MVREF
-static void update_mv_ref_probs(VP9_COMP *cpi,
-                                int mvref_probs[MAX_REF_FRAMES]
-                                               [MAX_MV_REF_CANDIDATES-1]) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int rf;     // Reference frame
-  int ref_c;  // Motion reference candidate
-  int node;   // Probability node index
-
-  for (rf = 0; rf < MAX_REF_FRAMES; ++rf) {
-    int count = 0;
-
-    // Skip the dummy entry for intra ref frame.
-    if (rf == INTRA_FRAME) {
-      continue;
-    }
-
-    // Sum the counts for all candidates
-    for (ref_c = 0; ref_c < MAX_MV_REF_CANDIDATES; ++ref_c) {
-      count += cpi->mb_mv_ref_count[rf][ref_c];
-    }
-
-    // Calculate the tree node probabilities
-    for (node = 0; node < MAX_MV_REF_CANDIDATES-1; ++node) {
-      int new_prob, old_cost, new_cost;
-      unsigned int branch_cnts[2];
-
-      // How many hits on each branch at this node
-      branch_cnts[0] = cpi->mb_mv_ref_count[rf][node];
-      branch_cnts[1] = count - cpi->mb_mv_ref_count[rf][node];
-
-      // Work out cost of coding branches with the old and optimal probability
-      old_cost = cost_branch256(branch_cnts, xd->mb_mv_ref_probs[rf][node]);
-      new_prob = get_prob(branch_cnts[0], count);
-      new_cost = cost_branch256(branch_cnts, new_prob);
-
-      // Take current 0 branch cases out of residual count
-      count -= cpi->mb_mv_ref_count[rf][node];
-
-      if ((new_cost + VP9_MV_REF_UPDATE_COST) <= old_cost) {
-        mvref_probs[rf][node] = new_prob;
+int count_term_subexp(int word, int k, int num_syms) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      count += count_uniform(word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
       } else {
-        mvref_probs[rf][node] = xd->mb_mv_ref_probs[rf][node];
+        count += b;
+        break;
       }
     }
   }
+  return count;
 }
-#endif
 
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
+static void compute_update_table() {
+  int i;
+  for (i = 0; i < 254; i++)
+    update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
 }
 
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
+static int split_index(int i, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (i % modulus == modulus / 2) i = i / modulus;
+  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
+  return i;
 }
 
-static void write_sb_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_sb_ymode_tree, p, vp9_sb_ymode_encodings + m);
-}
+static int remap_prob(int v, int m) {
+  const int n = 255;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  v--;
+  m--;
+  if ((m << 1) <= n)
+    i = recenter_nonneg(v, m) - 1;
+  else
+    i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
 
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
+  i = split_index(i, n - 1, modulus);
+  return i;
 }
 
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
+static void write_prob_diff_update(vp9_writer *w,
+                                   vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
 }
 
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  return update_bits[delp] * 256;
 }
 
-
-static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-#if CONFIG_NEWBINTRAMODES
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-  if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
-}
-
-static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
-}
-
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(
-    bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
                                const vp9_prob upd) {
@@ -362,19 +308,9 @@
   const int old_b = cost_branch256(ct, oldp);
   const int new_b = cost_branch256(ct, newp);
   const int update_b = 2048 + vp9_cost_upd256;
-  return (old_b - new_b - update_b);
+  return old_b - new_b - update_b;
 }
 
-static int prob_diff_update_savings(const unsigned int *ct,
-                                    const vp9_prob oldp, const vp9_prob newp,
-                                    const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = (newp == oldp ? 0 :
-                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
-  return (old_b - new_b - update_b);
-}
-
 static int prob_diff_update_savings_search(const unsigned int *ct,
                                            const vp9_prob oldp, vp9_prob *bestp,
                                            const vp9_prob upd) {
@@ -399,7 +335,6 @@
   return bestsavings;
 }
 
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
 static int prob_diff_update_savings_search_model(const unsigned int *ct,
                                                  const vp9_prob *oldp,
                                                  vp9_prob *bestp,
@@ -407,23 +342,26 @@
                                                  int b, int r) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
-  vp9_prob bestnewp, newplist[ENTROPY_NODES];
-  for (i = UNCONSTRAINED_NODES - 1, old_b = 0; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldp[i]);
+  vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vp9_model_to_full_probs(oldp, oldplist);
+  vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
 
   bestsavings = 0;
-  bestnewp = oldp[UNCONSTRAINED_NODES - 1];
+  bestnewp = oldp[PIVOT_NODE];
 
-  step = (*bestp > oldp[UNCONSTRAINED_NODES - 1] ? -1 : 1);
+  step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
   newp = *bestp;
-  // newp = *bestp - step * (abs(*bestp - oldp[UNCONSTRAINED_NODES - 1]) >> 1);
-  for (; newp != oldp[UNCONSTRAINED_NODES - 1]; newp += step) {
+  for (; newp != oldp[PIVOT_NODE]; newp += step) {
     if (newp < 1 || newp > 255) continue;
-    newplist[UNCONSTRAINED_NODES - 1] = newp;
-    vp9_get_model_distribution(newp, newplist, b, r);
-    for (i = UNCONSTRAINED_NODES - 1, new_b = 0; i < ENTROPY_NODES; ++i)
+    newplist[PIVOT_NODE] = newp;
+    vp9_model_to_full_probs(newplist, newplist);
+    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
       new_b += cost_branch256(ct + 2 * i, newplist[i]);
-    update_b = prob_diff_update_cost(newp, oldp[UNCONSTRAINED_NODES - 1]) +
+    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
         vp9_cost_upd256;
     savings = old_b - new_b - update_b;
     if (savings > bestsavings) {
@@ -434,7 +372,6 @@
   *bestp = bestnewp;
   return bestsavings;
 }
-#endif
 
 static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd,
                                  unsigned int *ct) {
@@ -441,10 +378,11 @@
   vp9_prob newp;
   int savings;
   newp = get_binary_prob(ct[0], ct[1]);
+  assert(newp >= 1);
   savings = prob_update_savings(ct, *oldp, newp, upd);
   if (savings > 0) {
     vp9_write(bc, 1, upd);
-    vp9_write_literal(bc, newp, 8);
+    vp9_write_prob(bc, newp);
     *oldp = newp;
   } else {
     vp9_write(bc, 0, upd);
@@ -451,6 +389,108 @@
   }
 }
 
+static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp,
+                                      vp9_prob upd,
+                                      unsigned int *ct) {
+  vp9_prob newp;
+  int savings;
+  newp = get_binary_prob(ct[0], ct[1]);
+  assert(newp >= 1);
+  savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+  if (savings > 0) {
+    vp9_write(bc, 1, upd);
+    write_prob_diff_update(bc, newp, *oldp);
+    *oldp = newp;
+  } else {
+    vp9_write(bc, 0, upd);
+  }
+}
+
+static void update_mode(
+  vp9_writer *w,
+  int n,
+  const struct vp9_token tok[/* n */],
+  vp9_tree tree,
+  vp9_prob Pnew[/* n-1 */],
+  vp9_prob Pcur[/* n-1 */],
+  unsigned int bct[/* n-1 */] [2],
+  const unsigned int num_events[/* n */]
+) {
+  int i = 0;
+
+  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+  n--;
+
+  for (i = 0; i < n; ++i) {
+    vp9_cond_prob_diff_update(w, &Pcur[i], VP9_MODE_UPDATE_PROB, bct[i]);
+  }
+}
+
+static void update_mbintra_mode_probs(VP9_COMP* const cpi,
+                                      vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  int j;
+  vp9_prob pnew[VP9_INTRA_MODES - 1];
+  unsigned int bct[VP9_INTRA_MODES - 1][2];
+
+  for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+    update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_encodings,
+                vp9_intra_mode_tree, pnew,
+                cm->fc.y_mode_prob[j], bct,
+                (unsigned int *)cpi->y_mode_count[j]);
+}
+
+void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  int k;
+
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    vp9_cond_prob_diff_update(bc, &pc->fc.mbskip_probs[k],
+                              VP9_MODE_UPDATE_PROB, pc->fc.mbskip_count[k]);
+  }
+}
+
+static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
+}
+
+static void update_switchable_interp_probs(VP9_COMP *const cpi,
+                                           vp9_writer* const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  unsigned int branch_ct[VP9_SWITCHABLE_FILTERS + 1]
+                        [VP9_SWITCHABLE_FILTERS - 1][2];
+  vp9_prob new_prob[VP9_SWITCHABLE_FILTERS + 1][VP9_SWITCHABLE_FILTERS - 1];
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    vp9_tree_probs_from_distribution(
+        vp9_switchable_interp_tree,
+        new_prob[j], branch_ct[j],
+        pc->fc.switchable_interp_count[j], 0);
+  }
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i],
+                                VP9_MODE_UPDATE_PROB, branch_ct[j][i]);
+    }
+  }
+#ifdef MODE_STATS
+  if (!cpi->dummy_packing)
+    update_switchable_interp_stats(pc);
+#endif
+}
+
+static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
+  int i, j;
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+    for (j = 0; j < VP9_INTER_MODES - 1; j++) {
+      vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
+                                VP9_MODE_UPDATE_PROB,
+                                pc->fc.inter_mode_counts[i][j]);
+    }
+  }
+}
+
 static void pack_mb_tokens(vp9_writer* const bc,
                            TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop) {
@@ -457,50 +497,65 @@
   TOKENEXTRA *p = *tp;
 
   while (p < stop) {
-    const int t = p->Token;
-    vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
+    const int t = p->token;
+    const struct vp9_token *const a = vp9_coef_encodings + t;
+    const vp9_extra_bit *const b = vp9_extra_bits + t;
     int i = 0;
-    const unsigned char *pp = p->context_tree;
+    const vp9_prob *pp;
     int v = a->value;
-    int n = a->Len;
+    int n = a->len;
+    vp9_prob probs[ENTROPY_NODES];
 
-    if (t == EOSB_TOKEN)
-    {
+    if (t == EOSB_TOKEN) {
       ++p;
       break;
     }
+    if (t >= TWO_TOKEN) {
+      vp9_model_to_full_probs(p->context_tree, probs);
+      pp = probs;
+    } else {
+      pp = p->context_tree;
+    }
+    assert(pp != 0);
 
     /* skip one or two nodes */
+#if !CONFIG_BALANCED_COEFTREE
     if (p->skip_eob_node) {
       n -= p->skip_eob_node;
       i = 2 * p->skip_eob_node;
     }
+#endif
 
     do {
       const int bb = (v >> --n) & 1;
-      encode_bool(bc, bb, pp[i >> 1]);
+#if CONFIG_BALANCED_COEFTREE
+      if (i == 2 && p->skip_eob_node) {
+        i += 2;
+        assert(bb == 1);
+        continue;
+      }
+#endif
+      vp9_write(bc, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
     } while (n);
 
-
     if (b->base_val) {
-      const int e = p->Extra, L = b->Len;
+      const int e = p->extra, l = b->len;
 
-      if (L) {
-        const unsigned char *pp = b->prob;
+      if (l) {
+        const unsigned char *pb = b->prob;
         int v = e >> 1;
-        int n = L;              /* number of bits in v, assumed nonzero */
+        int n = l;              /* number of bits in v, assumed nonzero */
         int i = 0;
 
         do {
           const int bb = (v >> --n) & 1;
-          encode_bool(bc, bb, pp[i >> 1]);
+          vp9_write(bc, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      encode_bool(bc, e & 1, 128);
+      vp9_write_bit(bc, e & 1);
     }
     ++p;
   }
@@ -508,225 +563,60 @@
   *tp = p;
 }
 
-static void write_partition_size(unsigned char *cx_data, int size) {
-  signed char csize;
-
-  csize = size & 0xff;
-  *cx_data = csize;
-  csize = (size >> 8) & 0xff;
-  *(cx_data + 1) = csize;
-  csize = (size >> 16) & 0xff;
-  *(cx_data + 2) = csize;
-
-}
-
-static void write_mv_ref
-(
-  vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
 static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
                             const vp9_prob *p) {
 #if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m < SPLITMV);
+  assert(NEARESTMV <= m && m <= NEWMV);
 #endif
   write_token(bc, vp9_sb_mv_ref_tree, p,
               vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
 }
 
-static void write_sub_mv_ref
-(
-  vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
-  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
-#endif
-  write_token(bc, vp9_sub_mv_ref_tree, p,
-              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
-}
-
-static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,
-                      const MV *mv, const int_mv *ref,
-                      const nmv_context *nmvc, int usehp) {
-  MV e;
-  e.row = mv->row - ref->as_mv.row;
-  e.col = mv->col - ref->as_mv.col;
-
-  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
-  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
-}
-
-#if CONFIG_NEW_MVREF
-static void vp9_write_mv_ref_id(vp9_writer *w,
-                                vp9_prob * ref_id_probs,
-                                int mv_ref_id) {
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      vp9_write(w, 0, ref_id_probs[0]);
-      break;
-    case 1:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 0, ref_id_probs[1]);
-      break;
-    case 2:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 0, ref_id_probs[2]);
-      break;
-    case 3:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 1, ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-}
-#endif
-
 // This function writes the current macro block's segnment id to the bitstream
 // It should only be called if a segment map update is indicated.
 static void write_mb_segid(vp9_writer *bc,
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        break;
-      case 2:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 3:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-    }
-  }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+    treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,
+                mi->segment_id, 3);
 }
 
-static void write_mb_segid_except(VP9_COMMON *cm,
-                                  vp9_writer *bc,
-                                  const MB_MODE_INFO *mi,
-                                  const MACROBLOCKD *xd,
-                                  int mb_row, int mb_col) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,
-                                          mb_row * cm->mb_cols + mb_col);
-  const vp9_prob *p = xd->mb_segment_tree_probs;
-  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    vp9_write(bc, seg_id >= 2, p1);
-    if (pred_seg_id >= 2 && seg_id < 2) {
-      vp9_write(bc, seg_id == 1, p[1]);
-    } else if (pred_seg_id < 2 && seg_id >= 2) {
-      vp9_write(bc, seg_id == 3, p[2]);
-    }
-  }
-}
-
 // This function encodes the reference frame
-static void encode_ref_frame(vp9_writer *const bc,
-                             VP9_COMMON *const cm,
-                             MACROBLOCKD *xd,
-                             int segment_id,
-                             MV_REFERENCE_FRAME rf) {
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
+static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
+  const int segment_id = mi->segment_id;
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                             SEG_LVL_REF_FRAME);
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_rf;
+  if (!seg_ref_active) {
+    // does the feature use compound prediction or not
+    // (if not specified at the frame/segment level)
+    if (pc->comp_pred_mode == HYBRID_PREDICTION) {
+      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP_INTER_INTER));
+    } else {
+      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
+                 (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+    }
 
-    // Get the context probability the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Get the predicted value.
-    pred_rf = vp9_get_pred_ref(cm, xd);
-
-    // Did the chosen reference frame match its predicted value.
-    prediction_flag =
-      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
-
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-    vp9_write(bc, prediction_flag, pred_prob);
-
-    // If not predicted correctly then code value explicitly
-    if (!prediction_flag) {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
-
-      // If segment coding enabled blank out options that cant occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      if (mod_refprobs[0]) {
-        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
-      }
-
-      // Inter coded
-      if (rf != INTRA_FRAME) {
-        if (mod_refprobs[1]) {
-          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
-        }
-
-        if (rf != LAST_FRAME) {
-          if (mod_refprobs[2]) {
-            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
-          }
-        }
-      }
+    if (mi->ref_frame[1] > INTRA_FRAME) {
+      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP_REF_P));
+    } else {
+      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P1));
+      if (mi->ref_frame[0] != LAST_FRAME)
+        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
+                  vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P2));
     }
+  } else {
+    assert(mi->ref_frame[1] <= INTRA_FRAME);
+    assert(vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) ==
+           mi->ref_frame[0]);
   }
 
   // if using the prediction mdoel we have nothing further to do because
@@ -733,51 +623,21 @@
   // the reference frame is fully coded by the segment
 }
 
-// Update the probabilities used to encode reference frame data
-static void update_ref_probs(VP9_COMP *const cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  const int rf_intra = rfct[INTRA_FRAME];
-  const int rf_inter = rfct[LAST_FRAME] +
-                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
-  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
-  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
-  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
-
-  // Compute a modified set of probabilities to use when prediction of the
-  // reference frame fails
-  vp9_compute_mod_refprobs(cm);
-}
-
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
-                                vp9_writer *bc,
-                                int mb_rows_left, int mb_cols_left) {
+                                vp9_writer *bc, int mi_row, int mi_col) {
   VP9_COMMON *const pc = &cpi->common;
   const nmv_context *nmvc = &pc->fc.nmvc;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int mis = pc->mode_info_stride;
   MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME rf = mi->ref_frame;
+  const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
-  const int mb_size = 1 << mi->sb_type;
   int skip_coeff;
 
-  int mb_row = pc->mb_rows - mb_rows_left;
-  int mb_col = pc->mb_cols - mb_cols_left;
   xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi);
   x->partition_info = x->pi + (m - pc->mi);
 
-  // Distance of Mb to the various image edges.
-  // These specified to 8th pel as they are always compared to MV
-  // values that are in 1/8th pel units
-
-  set_mb_row(pc, xd, mb_row, mb_size);
-  set_mb_col(pc, xd, mb_col, mb_size);
-
 #ifdef ENTROPY_STATS
   active_section = 9;
 #endif
@@ -793,7 +653,7 @@
 
       // If the mb segment id wasn't predicted code explicitly
       if (!prediction_flag)
-        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);
+        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
     } else {
       // Normal unpredicted coding
       write_mb_segid(bc, mi, &cpi->mb.e_mbd);
@@ -800,9 +660,7 @@
     }
   }
 
-  if (!pc->mb_no_coeff_skip) {
-    skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     skip_coeff = m->mbmi.mb_skip_coeff;
@@ -810,42 +668,50 @@
               vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
   }
 
-  // Encode the reference frame.
-  encode_ref_frame(bc, pc, xd, segment_id, rf);
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+    vp9_write(bc, rf != INTRA_FRAME,
+              vp9_get_pred_prob(pc, xd, PRED_INTRA_INTER));
 
+  if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
+      !(rf != INTRA_FRAME &&
+        (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+    TX_SIZE sz = mi->txfm_size;
+    const vp9_prob *tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE);
+    vp9_write(bc, sz != TX_4X4, tx_probs[0]);
+    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, tx_probs[1]);
+      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, tx_probs[2]);
+    }
+  }
+
   if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type)
-      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-    else
-      write_ymode(bc, mode, pc->fc.ymode_prob);
-
-    if (mode == B_PRED) {
-      int j = 0;
-      do {
-        write_bmode(bc, m->bmi[j].as_mode.first,
-                    pc->fc.bmode_prob);
-      } while (++j < 16);
-    }
-    if (mode == I8X8_PRED) {
-      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
+    if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+      const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+      const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+      const int bsl = MIN(bwl, bhl);
+      write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]);
     } else {
-      write_uv_mode(bc, mi->uv_mode,
-                    pc->fc.uv_mode_prob[mode]);
+      int idx, idy;
+      int bw = 1 << b_width_log2(mi->sb_type);
+      int bh = 1 << b_height_log2(mi->sb_type);
+      for (idy = 0; idy < 2; idy += bh)
+        for (idx = 0; idx < 2; idx += bw) {
+          MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first;
+          write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
+        }
     }
+    write_intra_mode(bc, mi->uv_mode,
+                     pc->fc.uv_mode_prob[mode]);
   } else {
-    vp9_prob mv_ref_p[VP9_MVREFS - 1];
+    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
 
+    encode_ref_frame(cpi, bc);
+
     vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
 
 #ifdef ENTROPY_STATS
@@ -854,156 +720,63 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type) {
+      if (mi->sb_type >= BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
-      } else {
-        write_mv_ref(bc, mode, mv_ref_p);
+        vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
       }
-      vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
     }
 
-    if (mode >= NEARESTMV && mode <= SPLITMV) {
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        write_token(bc, vp9_switchable_interp_tree,
-                    vp9_get_pred_probs(&cpi->common, xd,
-                                       PRED_SWITCHABLE_INTERP),
-                    vp9_switchable_interp_encodings +
-                    vp9_switchable_interp_map[mi->interp_filter]);
-      } else {
-        assert(mi->interp_filter == cpi->common.mcomp_filter_type);
-      }
+    if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      write_token(bc, vp9_switchable_interp_tree,
+                  vp9_get_pred_probs(&cpi->common, xd,
+                                     PRED_SWITCHABLE_INTERP),
+                  vp9_switchable_interp_encodings +
+                  vp9_switchable_interp_map[mi->interp_filter]);
+    } else {
+      assert(mi->interp_filter == cpi->common.mcomp_filter_type);
     }
 
-    // does the feature use compound prediction or not
-    // (if not specified at the frame/segment level)
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
-                vp9_get_pred_prob(pc, xd, PRED_COMP));
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra &&
-        mode >= NEARESTMV && mode < SPLITMV &&
-        mi->second_ref_frame <= INTRA_FRAME) {
-      vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
-                pc->fc.interintra_prob);
-      // if (!cpi->dummy_packing)
-      //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
-      //          pc->fc.interintra_prob);
-      if (mi->second_ref_frame == INTRA_FRAME) {
-        // if (!cpi->dummy_packing)
-        //   printf("** %d %d\n", mi->interintra_mode,
-        // mi->interintra_uv_mode);
-        write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
-#if SEPARATE_INTERINTRA_UV
-        write_uv_mode(bc, mi->interintra_uv_mode,
-                      pc->fc.uv_mode_prob[mi->interintra_mode]);
-#endif
-      }
-    }
-#endif
-
-#if CONFIG_NEW_MVREF
-    // if ((mode == NEWMV) || (mode == SPLITMV)) {
-    if (mode == NEWMV) {
-      // Encode the index of the choice.
-      vp9_write_mv_ref_id(bc,
-                          xd->mb_mv_ref_probs[rf], mi->best_index);
-
-      if (mi->second_ref_frame > 0) {
-        // Encode the index of the choice.
-        vp9_write_mv_ref_id(
-                            bc, xd->mb_mv_ref_probs[mi->second_ref_frame],
-                            mi->best_second_index);
-      }
-    }
-#endif
-
-    switch (mode) { /* new, split require MVs */
-      case NEWMV:
-#ifdef ENTROPY_STATS
-        active_section = 5;
-#endif
-        write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv,
-                  (const nmv_context*) nmvc,
-                  xd->allow_high_precision_mv);
-
-        if (mi->second_ref_frame > 0) {
-          write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv,
-                    (const nmv_context*) nmvc,
-                    xd->allow_high_precision_mv);
-        }
-        break;
-      case SPLITMV: {
-        int j = 0;
-
-#ifdef MODE_STATS
-        ++count_mb_seg[mi->partitioning];
-#endif
-
-        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-        cpi->mbsplit_count[mi->partitioning]++;
-
-        do {
-          B_PREDICTION_MODE blockmode;
-          int_mv blockmv;
-          const int *const  L = vp9_mbsplits[mi->partitioning];
-          int k = -1;  /* first block in subset j */
-          int mv_contz;
-          int_mv leftmv, abovemv;
-
+    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      int j;
+      MB_PREDICTION_MODE blockmode;
+      int_mv blockmv;
+      int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
+      int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
+          j = idy * 2 + idx;
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-          while (j != L[++k])
-            if (k >= 16)
-              assert(0);
-#else
-          while (j != L[++k]);
-#endif
-          leftmv.as_int = left_block_mv(xd, m, k);
-          abovemv.as_int = above_block_mv(m, k, mis);
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
-          write_sub_mv_ref(bc, blockmode,
-                           cpi->common.fc.sub_mv_ref_prob[mv_contz]);
-          cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-          if (blockmode == NEW4X4) {
+          write_sb_mv_ref(bc, blockmode, mv_ref_p);
+          vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]);
+          if (blockmode == NEWMV) {
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
-            write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv,
-                      (const nmv_context*) nmvc,
-                      xd->allow_high_precision_mv);
+            vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+                          nmvc, xd->allow_high_precision_mv);
 
-            if (mi->second_ref_frame > 0) {
-              write_nmv(cpi, bc,
-                        &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                        &mi->best_second_mv,
-                        (const nmv_context*) nmvc,
-                        xd->allow_high_precision_mv);
-            }
+            if (mi->ref_frame[1] > INTRA_FRAME)
+              vp9_encode_mv(bc,
+                            &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                            &mi->best_second_mv.as_mv,
+                            nmvc, xd->allow_high_precision_mv);
           }
-        } while (++j < cpi->mb.partition_info->count);
-        break;
+        }
       }
-      default:
-        break;
-    }
-  }
+    } else if (mode == NEWMV) {
+#ifdef ENTROPY_STATS
+      active_section = 5;
+#endif
+      vp9_encode_mv(bc,
+                    &mi->mv[0].as_mv, &mi->best_mv.as_mv,
+                    nmvc, xd->allow_high_precision_mv);
 
-  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-       (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                               mi->partitioning == PARTITIONING_4X4))) &&
-      pc->txfm_mode == TX_MODE_SELECT &&
-      !((pc->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-    TX_SIZE sz = mi->txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
-      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-      if (mi->sb_type && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+      if (mi->ref_frame[1] > INTRA_FRAME)
+        vp9_encode_mv(bc,
+                      &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
+                      nmvc, xd->allow_high_precision_mv);
     }
   }
 }
@@ -1010,726 +783,206 @@
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
                               MODE_INFO *m,
-                              vp9_writer *bc,
-                              int mb_rows_left, int mb_cols_left) {
+                              vp9_writer *bc, int mi_row, int mi_col) {
   const VP9_COMMON *const c = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int mis = c->mode_info_stride;
   const int ym = m->mbmi.mode;
+  const int mis = c->mode_info_stride;
   const int segment_id = m->mbmi.segment_id;
   int skip_coeff;
 
-  if (xd->update_mb_segmentation_map) {
+  if (xd->update_mb_segmentation_map)
     write_mb_segid(bc, &m->mbmi, xd);
-  }
 
-  if (!c->mb_no_coeff_skip) {
-    skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     skip_coeff = m->mbmi.mb_skip_coeff;
-    vp9_write(bc, skip_coeff,
-              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+    vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-  if (m->mbmi.sb_type) {
-    sb_kfwrite_ymode(bc, ym,
-                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
-  } else {
-    kfwrite_ymode(bc, ym,
-                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE);
+    vp9_write(bc, sz != TX_4X4, tx_probs[0]);
+    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, tx_probs[1]);
+      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, tx_probs[2]);
+    }
   }
 
-  if (ym == B_PRED) {
-    int i = 0;
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ?
-                                  left_block_mode(m, i) : B_DC_PRED;
-      const int bm = m->bmi[i].as_mode.first;
-
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                 left_block_mode(m, 0) : DC_PRED;
+    write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]);
+  } else {
+    int idx, idy;
+    int bw = 1 << b_width_log2(m->mbmi.sb_type);
+    int bh = 1 << b_height_log2(m->mbmi.sb_type);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int i = idy * 2 + idx;
+        const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(m, i) : DC_PRED;
+        const int bm = m->bmi[i].as_mode.first;
 #ifdef ENTROPY_STATS
-      ++intra_mode_stats [A] [L] [bm];
+        ++intra_mode_stats[A][L][bm];
 #endif
-
-      write_kf_bmode(bc, bm, c->kf_bmode_prob[A][L]);
-    } while (++i < 16);
-  }
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
-  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED) {
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-      if (m->mbmi.sb_type && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
+        write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]);
+      }
     }
   }
+
+  write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void write_nzc(VP9_COMMON *const cm,
-                      uint16_t nzc,
-                      int nzc_context,
-                      TX_SIZE tx_size,
-                      int ref,
-                      int type,
-                      vp9_writer* const bc) {
-  int c, e;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32) {
-    write_token(bc, vp9_nzc32x32_tree,
-                cm->fc.nzc_probs_32x32[nzc_context][ref][type],
-                vp9_nzc32x32_encodings + c);
-    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    write_token(bc, vp9_nzc16x16_tree,
-                cm->fc.nzc_probs_16x16[nzc_context][ref][type],
-                vp9_nzc16x16_encodings + c);
-    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    write_token(bc, vp9_nzc8x8_tree,
-                cm->fc.nzc_probs_8x8[nzc_context][ref][type],
-                vp9_nzc8x8_encodings + c);
-    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    write_token(bc, vp9_nzc4x4_tree,
-                cm->fc.nzc_probs_4x4[nzc_context][ref][type],
-                vp9_nzc4x4_encodings + c);
-    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  xd->mode_info_context = m;
+  set_mi_row_col(&cpi->common, xd, mi_row,
+                 1 << mi_height_log2(m->mbmi.sb_type),
+                 mi_col, 1 << mi_width_log2(m->mbmi.sb_type));
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+    write_mb_modes_kf(cpi, m, bc, mi_row, mi_col);
+#ifdef ENTROPY_STATS
+    active_section = 8;
+#endif
   } else {
-    assert(0);
+    pack_inter_mode_mvs(cpi, m, bc, mi_row, mi_col);
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
   }
 
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      vp9_write(bc, b,
-                cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
+  assert(*tok < tok_end);
+  pack_mb_tokens(bc, tok, tok_end);
 }
 
-static void write_nzcs_sb64(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
+static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                           int mi_row, int mi_col,
+                           BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 4;  // mode_info step for subsize
+  int n;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
 
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);
-      }
-      break;
+  bwl = b_width_log2(m->mbmi.sb_type);
+  bhl = b_height_log2(m->mbmi.sb_type);
 
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
+  // parse the partition type
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
 
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
 
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int pl;
+    int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    pl = partition_plane_context(xd, bsize);
+    // encode the partition information
+    if (idx == 0)
+      write_token(bc, vp9_partition_tree,
+                  cm->fc.partition_prob[cm->frame_type][pl],
+                  vp9_partition_encodings + partition);
+    else if (idx > 0)
+      vp9_write(bc, partition == PARTITION_SPLIT,
+                cm->fc.partition_prob[cm->frame_type][pl][idx]);
   }
-}
 
-static void write_nzcs_sb32(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+  subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
 
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
+  switch (partition) {
+    case PARTITION_NONE:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
+    case PARTITION_HORZ:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
+      if ((mi_row + bs) < cm->mi_rows)
+        write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col);
       break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
+    case PARTITION_VERT:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
+      if ((mi_col + bs) < cm->mi_cols)
+        write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs);
       break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+    case PARTITION_SPLIT:
+      for (n = 0; n < 4; n++) {
+        int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
+        write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
+                       mi_row + j * bs, mi_col + i * bs, subsize);
       }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
       break;
-
     default:
-      break;
+      assert(0);
   }
-}
 
-static void write_nzcs_mb16(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
+  // update partition context
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
   }
 }
 
-#ifdef NZC_STATS
-void init_nzcstats() {
-  vp9_zero(nzc_stats_4x4);
-  vp9_zero(nzc_stats_8x8);
-  vp9_zero(nzc_stats_16x16);
-  vp9_zero(nzc_stats_32x32);
-  vp9_zero(nzc_pcat_stats);
-}
-
-void update_nzcstats(VP9_COMMON *const cm) {
-  int c, r, b, t;
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0];
-        nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1];
-      }
-    }
-  }
-}
-
-void print_nzcstats() {
-  int c, r, b, t;
-  FILE *f;
-
-  printf(
-    "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_4x4[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_8x8[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC16X16_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_16x16[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC32X32_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_32x32[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n"
-    "                                             [NZC_TOKENS_EXTRA]\n"
-    "                                             [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        printf(" %d/%d,",
-               nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC4X4_NODES];
-        unsigned int branch_ct[NZC4X4_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc4x4_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_4x4[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC4X4_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC8X8_NODES];
-        unsigned int branch_ct[NZC8X8_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc8x8_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_8x8[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC8X8_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC16X16_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC16X16_NODES];
-        unsigned int branch_ct[NZC16X16_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc16x16_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_16x16[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC16X16_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC32X32_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC32X32_NODES];
-        unsigned int branch_ct[NZC32X32_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc32x32_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_32x32[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC32X32_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n"
-    "                                            [NZC_TOKENS_EXTRA]\n"
-    "                                            [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0],
-                                        nzc_pcat_stats[c][t][b][1]);
-        printf(" %-3d,", prob);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  f = fopen("nzcstats.bin", "wb");
-  fwrite(nzc_stats_4x4, sizeof(nzc_stats_4x4), 1, f);
-  fwrite(nzc_stats_8x8, sizeof(nzc_stats_8x8), 1, f);
-  fwrite(nzc_stats_16x16, sizeof(nzc_stats_16x16), 1, f);
-  fwrite(nzc_stats_32x32, sizeof(nzc_stats_32x32), 1, f);
-  fwrite(nzc_pcat_stats, sizeof(nzc_pcat_stats), 1, f);
-  fclose(f);
-}
-#endif
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
-                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                          int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  xd->mode_info_context = m;
-  set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type));
-  set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type));
-  if (cm->frame_type == KEY_FRAME) {
-    write_mb_modes_kf(cpi, m, bc,
-                      cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-#ifdef ENTROPY_STATS
-    active_section = 8;
-#endif
-  } else {
-    pack_inter_mode_mvs(cpi, m, bc,
-                        cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
-  }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);
-  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);
-  else
-    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);
-#endif
-
-  assert(*tok < tok_end);
-  pack_mb_tokens(bc, tok, tok_end);
-}
-
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
                         TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
   MODE_INFO *m, *m_ptr = c->mi;
-  int i, mb_row, mb_col;
+  int mi_row, mi_col;
 
-  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
-  for (mb_row = c->cur_tile_mb_row_start;
-       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
-    m = m_ptr;
-    for (mb_col = c->cur_tile_mb_col_start;
-         mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {
-      vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
-      if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
-      } else {
-        int j;
+  m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
+  vpx_memset(c->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+             mi_cols_aligned_to_sb(c));
 
-        for (j = 0; j < 4; j++) {
-          const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
-          MODE_INFO *sb_m = m + y_idx_sb * mis + x_idx_sb;
-
-          if (mb_col + x_idx_sb >= c->mb_cols ||
-              mb_row + y_idx_sb >= c->mb_rows)
-            continue;
-
-          vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
-          if (sb_m->mbmi.sb_type) {
-            assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-            write_modes_b(cpi, sb_m, bc, tok, tok_end,
-                          mb_row + y_idx_sb, mb_col + x_idx_sb);
-          } else {
-            // Process the 4 MBs in the order:
-            // top-left, top-right, bottom-left, bottom-right
-            for (i = 0; i < 4; i++) {
-              const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
-              MODE_INFO *mb_m = m + x_idx + y_idx * mis;
-
-              if (mb_row + y_idx >= c->mb_rows ||
-                  mb_col + x_idx >= c->mb_cols) {
-                // MB lies outside frame, move on
-                continue;
-              }
-
-              assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              write_modes_b(cpi, mb_m, bc, tok, tok_end,
-                            mb_row + y_idx, mb_col + x_idx);
-            }
-          }
-        }
-      }
-    }
+  for (mi_row = c->cur_tile_mi_row_start;
+       mi_row < c->cur_tile_mi_row_end;
+       mi_row += 8, m_ptr += 8 * mis) {
+    m = m_ptr;
+    vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
+    for (mi_col = c->cur_tile_mi_col_start;
+         mi_col < c->cur_tile_mi_col_end;
+         mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE)
+      write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
+                     BLOCK_SIZE_SB64X64);
   }
 }
 
-
 /* This function is used for debugging probability trees. */
 static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   /* print coef probability tree */
@@ -1759,23 +1012,16 @@
   fclose(f);
 }
 
-static void build_tree_distribution(vp9_coeff_probs *coef_probs,
-                                    vp9_coeff_count *coef_counts,
-                                    unsigned int (*eob_branch_ct)[REF_TYPES]
-                                                                 [COEF_BANDS]
-                                                          [PREV_COEF_CONTEXTS],
-#ifdef ENTROPY_STATS
-                                    VP9_COMP *cpi,
-                                    vp9_coeff_accum *context_counters,
-#endif
-                                    vp9_coeff_stats *coef_branch_ct,
-                                    int block_types) {
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) {
+  vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size];
+  vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size];
+  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+      cpi->common.fc.eob_branch_counts[txfm_size];
+  vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size];
+  vp9_prob full_probs[ENTROPY_NODES];
   int i, j, k, l;
-#ifdef ENTROPY_STATS
-  int t = 0;
-#endif
 
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
@@ -1782,19 +1028,31 @@
           if (l >= 3 && k == 0)
             continue;
           vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           coef_probs[i][j][k][l],
+                                           full_probs,
                                            coef_branch_ct[i][j][k][l],
                                            coef_counts[i][j][k][l], 0);
+          vpx_memcpy(coef_probs[i][j][k][l], full_probs,
+                     sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+#if CONFIG_BALANCED_COEFTREE
+          coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] -
+                                             coef_branch_ct[i][j][k][l][1][0];
+          coef_probs[i][j][k][l][1] =
+              get_binary_prob(coef_branch_ct[i][j][k][l][1][0],
+                              coef_branch_ct[i][j][k][l][1][1]);
+#else
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           coef_probs[i][j][k][l][0] =
               get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
                               coef_branch_ct[i][j][k][l][0][1]);
+#endif
 #ifdef ENTROPY_STATS
           if (!cpi->dummy_packing) {
+            int t;
             for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t];
-            context_counters[i][j][k][l][MAX_ENTROPY_TOKENS] +=
+              context_counters[txfm_size][i][j][k][l][t] +=
+                  coef_counts[i][j][k][l][t];
+            context_counters[txfm_size][i][j][k][l][MAX_ENTROPY_TOKENS] +=
                 eob_branch_ct[i][j][k][l];
           }
 #endif
@@ -1805,301 +1063,45 @@
 }
 
 static void build_coeff_contexts(VP9_COMP *cpi) {
-  build_tree_distribution(cpi->frame_coef_probs_4x4,
-                          cpi->coef_counts_4x4,
-                          cpi->common.fc.eob_branch_counts[TX_4X4],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_4x4,
-#endif
-                          cpi->frame_branch_ct_4x4, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_8x8,
-                          cpi->coef_counts_8x8,
-                          cpi->common.fc.eob_branch_counts[TX_8X8],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_8x8,
-#endif
-                          cpi->frame_branch_ct_8x8, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_16x16,
-                          cpi->coef_counts_16x16,
-                          cpi->common.fc.eob_branch_counts[TX_16X16],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_16x16,
-#endif
-                          cpi->frame_branch_ct_16x16, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_32x32,
-                          cpi->coef_counts_32x32,
-                          cpi->common.fc.eob_branch_counts[TX_32X32],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_32x32,
-#endif
-                          cpi->frame_branch_ct_32x32, BLOCK_TYPES);
+  TX_SIZE t;
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    build_tree_distribution(cpi, t);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void update_nzc_probs_common(VP9_COMP* cpi,
-                                    vp9_writer* const bc,
-                                    int block_size) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, r, b, t;
-  int update[2] = {0, 0};
-  int savings = 0;
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *new_nzc_probs;
-  vp9_prob *old_nzc_probs;
-  unsigned int *nzc_counts;
-  unsigned int (*nzc_branch_ct)[2];
-  vp9_prob upd;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
-  } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
-  }
-  nodes = tokens - 1;
-  // Get the new probabilities and the branch counts
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         new_nzc_probs + offset_nodes,
-                                         nzc_branch_ct + offset_nodes,
-                                         nzc_counts + offset_tokens, 0);
-      }
-    }
-  }
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob newp = new_nzc_probs[offset_nodes + t];
-          vp9_prob oldp = old_nzc_probs[offset_nodes + t];
-          int s, u = 0;
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                oldp, &newp, upd);
-            if (s > 0 && newp != oldp)
-              u = 1;
-            if (u)
-              savings += s - (int)(vp9_cost_zero(upd));
-            else
-              savings -= (int)(vp9_cost_zero(upd));
-#else
-          s = prob_update_savings(nzc_branch_ct[offset_nodes],
-                                  oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
-#endif
-          update[u]++;
-        }
-      }
-    }
-  }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (r = 0; r < REF_TYPES; ++r) {
-        for (b = 0; b < BLOCK_TYPES; ++b) {
-          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-          int offset_nodes = offset * nodes;
-          for (t = 0; t < nodes; ++t) {
-            vp9_prob newp = new_nzc_probs[offset_nodes + t];
-            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];
-            int s, u = 0;
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-#else
-            s = prob_update_savings(nzc_branch_ct[offset_nodes],
-                                    *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
-            vp9_write(bc, u, upd);
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, t, b;
-  int update[2] = {0, 0};
-  int savings = 0;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b];
-        int s, u = 0;
-#if defined(SEARCH_NEWP)
-        s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
-                                            oldp, &newp, upd);
-        if (s > 0 && newp != oldp)
-          u = 1;
-        if (u)
-          savings += s - (int)(vp9_cost_zero(upd));
-        else
-          savings -= (int)(vp9_cost_zero(upd));
-#else
-        s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
-                                oldp, newp, upd);
-        if (s > 0)
-          u = 1;
-        if (u)
-          savings += s;
-#endif
-        update[u]++;
-      }
-    }
-  }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-        int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-        for (b = 0; b < bits; ++b) {
-          vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                          cm->fc.nzc_pcat_counts[c][t][b][1]);
-          vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b];
-          int s, u = 0;
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
-                                              *oldp, &newp, upd);
-          if (s > 0 && newp != *oldp)
-            u = 1;
-#else
-          s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
-                                  *oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-#endif
-          vp9_write(bc, u, upd);
-          if (u) {
-            /* send/use new probability */
-            write_prob_diff_update(bc, newp, *oldp);
-            *oldp = newp;
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_nzc_probs(VP9_COMP* cpi,
-                             vp9_writer* const bc) {
-  update_nzc_probs_common(cpi, bc, 4);
-  if (cpi->common.txfm_mode != ONLY_4X4)
-    update_nzc_probs_common(cpi, bc, 8);
-  if (cpi->common.txfm_mode > ALLOW_8X8)
-    update_nzc_probs_common(cpi, bc, 16);
-  if (cpi->common.txfm_mode > ALLOW_16X16)
-    update_nzc_probs_common(cpi, bc, 32);
-#ifdef NZC_PCAT_UPDATE
-  update_nzc_pcat_probs(cpi, bc);
-#endif
-#ifdef NZC_STATS
-  if (!cpi->dummy_packing)
-    update_nzcstats(&cpi->common);
-#endif
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void update_coef_probs_common(vp9_writer* const bc,
-#ifdef ENTROPY_STATS
-                                     VP9_COMP *cpi,
-                                     vp9_coeff_stats *tree_update_hist,
-#endif
-                                     vp9_coeff_probs *new_frame_coef_probs,
-                                     vp9_coeff_probs *old_frame_coef_probs,
-                                     vp9_coeff_stats *frame_branch_ct,
-                                     int block_types) {
+static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
+                                     TX_SIZE tx_size) {
+  vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
+  vp9_coeff_probs_model *old_frame_coef_probs =
+      cpi->common.fc.coef_probs[tx_size];
+  vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
   int i, j, k, l, t;
   int update[2] = {0, 0};
   int savings;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;
-#else
-  const int entropy_nodes_update = ENTROPY_NODES;
-#endif
-  // vp9_prob bestupd = find_coef_update_prob(cpi);
 
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+
+  const int tstart = 0;
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         // int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {
+          for (t = tstart; t < entropy_nodes_update; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
 
             if (l >= 3 && k == 0)
               continue;
-#if defined(SEARCH_NEWP)
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-            if (t == UNCONSTRAINED_NODES - 1)
+            if (t == PIVOT_NODE)
               s = prob_diff_update_savings_search_model(
                   frame_branch_ct[i][j][k][l][0],
                   old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
             else
-#endif
               s = prob_diff_update_savings_search(
                   frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
             if (s > 0 && newp != oldp)
@@ -2108,15 +1110,6 @@
               savings += s - (int)(vp9_cost_zero(upd));
             else
               savings -= (int)(vp9_cost_zero(upd));
-#else
-            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
-                                    oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-            if (u)
-              savings += s;
-#endif
-
             update[u]++;
           }
         }
@@ -2131,54 +1124,39 @@
     return;
   }
   vp9_write_bit(bc, 1);
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         // int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           // calc probs and branch cts for this frame only
-          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {
+          for (t = tstart; t < entropy_nodes_update; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
             if (l >= 3 && k == 0)
               continue;
-
-#if defined(SEARCH_NEWP)
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-            if (t == UNCONSTRAINED_NODES - 1)
+            if (t == PIVOT_NODE)
               s = prob_diff_update_savings_search_model(
                   frame_branch_ct[i][j][k][l][0],
                   old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
             else
-#endif
               s = prob_diff_update_savings_search(
                   frame_branch_ct[i][j][k][l][t],
                   *oldp, &newp, upd);
             if (s > 0 && newp != *oldp)
               u = 1;
-#else
-            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
-                                    *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
             vp9_write(bc, u, upd);
 #ifdef ENTROPY_STATS
             if (!cpi->dummy_packing)
-              ++tree_update_hist[i][j][k][l][t][u];
+              ++tree_update_hist[tx_size][i][j][k][l][t][u];
 #endif
             if (u) {
               /* send/use new probability */
               write_prob_diff_update(bc, newp, *oldp);
               *oldp = newp;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-              if (t == UNCONSTRAINED_NODES - 1)
-                vp9_get_model_distribution(
-                    newp, old_frame_coef_probs[i][j][k][l], i, j);
-#endif
             }
           }
         }
@@ -2188,738 +1166,565 @@
 }
 
 static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+  const TXFM_MODE txfm_mode = cpi->common.txfm_mode;
+
   vp9_clear_system_state();
 
   // Build the cofficient contexts based on counts collected in encode loop
   build_coeff_contexts(cpi);
 
-  update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                           cpi,
-                           tree_update_hist_4x4,
-#endif
-                           cpi->frame_coef_probs_4x4,
-                           cpi->common.fc.coef_probs_4x4,
-                           cpi->frame_branch_ct_4x4,
-                           BLOCK_TYPES);
+  update_coef_probs_common(bc, cpi, TX_4X4);
 
-  /* do not do this if not even allowed */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_8x8,
-#endif
-                             cpi->frame_coef_probs_8x8,
-                             cpi->common.fc.coef_probs_8x8,
-                             cpi->frame_branch_ct_8x8,
-                             BLOCK_TYPES);
-  }
+  // do not do this if not even allowed
+  if (txfm_mode > ONLY_4X4)
+    update_coef_probs_common(bc, cpi, TX_8X8);
 
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_16x16,
-#endif
-                             cpi->frame_coef_probs_16x16,
-                             cpi->common.fc.coef_probs_16x16,
-                             cpi->frame_branch_ct_16x16,
-                             BLOCK_TYPES);
-  }
+  if (txfm_mode > ALLOW_8X8)
+    update_coef_probs_common(bc, cpi, TX_16X16);
 
-  if (cpi->common.txfm_mode > ALLOW_16X16) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_32x32,
-#endif
-                             cpi->frame_coef_probs_32x32,
-                             cpi->common.fc.coef_probs_32x32,
-                             cpi->frame_branch_ct_32x32,
-                             BLOCK_TYPES);
-  }
+  if (txfm_mode > ALLOW_16X16)
+    update_coef_probs_common(bc, cpi, TX_32X32);
 }
 
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
+                              struct vp9_write_bit_buffer *wb) {
+  int i;
 
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
+  // Encode the loop filter level and type
+  vp9_wb_write_literal(wb, pc->filter_level, 6);
+  vp9_wb_write_literal(wb, pc->sharpness_level, 3);
 
-    if (delta_q < 0)
-      vp9_write_bit(bc, 1);
-    else
-      vp9_write_bit(bc, 0);
-  } else
-    vp9_write_bit(bc, 0);
-}
+  // Write out loop filter deltas applied at the MB level based on mode or
+  // ref frame (if they are enabled).
+  vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_enabled);
 
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated
+    vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_update);
+    if (xd->mode_ref_lf_delta_update) {
+      // Send update
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        const int delta = xd->ref_lf_deltas[i];
 
-  int mode_cost[MB_MODE_COUNT];
-  int cost;
-  int bestcost = INT_MAX;
-  int bestindex = 0;
-  int i, j;
+        // Frame level data
+        if (delta != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = delta;
+          vp9_wb_write_bit(wb, 1);
 
-  for (i = 0; i < 8; i++) {
-    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
-    cost = 0;
-    for (j = 0; j < VP9_YMODES; j++) {
-      cost += mode_cost[j] * cpi->ymode_count[j];
+          assert(delta != 0);
+          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vp9_wb_write_bit(wb, delta < 0);
+        } else {
+          vp9_wb_write_bit(wb, 0);
+        }
+      }
+
+      // Send update
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        const int delta = xd->mode_lf_deltas[i];
+        if (delta != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = delta;
+          vp9_wb_write_bit(wb, 1);
+
+          assert(delta != 0);
+          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vp9_wb_write_bit(wb, delta < 0);
+        } else {
+          vp9_wb_write_bit(wb, 0);
+        }
+      }
     }
-    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
-                    vp9_sb_ymode_tree);
-    for (j = 0; j < VP9_I32X32_MODES; j++) {
-      cost += mode_cost[j] * cpi->sb_ymode_count[j];
-    }
-    if (cost < bestcost) {
-      bestindex = i;
-      bestcost = cost;
-    }
   }
-  cpi->common.kf_ymode_probs_index = bestindex;
-
 }
-static void segment_reference_frames(VP9_COMP *cpi) {
-  VP9_COMMON *oci = &cpi->common;
-  MODE_INFO *mi = oci->mi;
-  int ref[MAX_MB_SEGMENTS] = {0};
-  int i, j;
-  int mb_index = 0;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  for (i = 0; i < oci->mb_rows; i++) {
-    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
-    }
-    mb_index++;
+static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) {
+  if (delta_q != 0) {
+    vp9_wb_write_bit(wb, 1);
+    vp9_wb_write_literal(wb, abs(delta_q), 4);
+    vp9_wb_write_bit(wb, delta_q < 0);
+  } else {
+    vp9_wb_write_bit(wb, 0);
   }
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
-    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
-  }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
+static void encode_quantization(VP9_COMMON *cm,
+                                struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+  write_delta_q(wb, cm->y_dc_delta_q);
+  write_delta_q(wb, cm->uv_dc_delta_q);
+  write_delta_q(wb, cm->uv_ac_delta_q);
+}
+
+
+static void encode_segmentation(VP9_COMP *cpi,
+                               struct vp9_write_bit_buffer *wb) {
   int i, j;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
 
-  unsigned char *cx_data = dest;
+  vp9_wb_write_bit(wb, xd->segmentation_enabled);
+  if (!xd->segmentation_enabled)
+    return;
 
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
-
-  cx_data += 3;
-
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
-
-  compute_update_table();
-
-  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
-   * for each K frame before encode frame. pc->kf_bmode_prob doesn't get
-   * changed anywhere else. No need to call it again here. --yw
-   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
-   */
-
-  /* every keyframe send startcode, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    // Start / synch code
-    cx_data[0] = 0x9D;
-    cx_data[1] = 0x01;
-    cx_data[2] = 0x2a;
-    extra_bytes_packed = 3;
-    cx_data += extra_bytes_packed;
-  }
-  {
-    int v;
-
-    if (pc->width != pc->display_width || pc->height != pc->display_height) {
-      v = pc->display_width;
-      cx_data[0] = v;
-      cx_data[1] = v >> 8;
-
-      v = pc->display_height;
-      cx_data[2] = v;
-      cx_data[3] = v >> 8;
-      cx_data += 4;
-      extra_bytes_packed += 4;
+  // Segmentation map
+  vp9_wb_write_bit(wb, xd->update_mb_segmentation_map);
+  if (xd->update_mb_segmentation_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp9_choose_segmap_coding_method(cpi);
+    // Write out probabilities used to decode unpredicted  macro-block segments
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
+      const int prob = xd->mb_segment_tree_probs[i];
+      const int update = prob != MAX_PROB;
+      vp9_wb_write_bit(wb, update);
+      if (update)
+        vp9_wb_write_literal(wb, prob, 8);
     }
 
-    v = pc->width;
-    cx_data[0] = v;
-    cx_data[1] = v >> 8;
-
-    v = pc->height;
-    cx_data[2] = v;
-    cx_data[3] = v >> 8;
-
-    extra_bytes_packed += 4;
-    cx_data += 4;
+    // Write out the chosen coding method.
+    vp9_wb_write_bit(wb, cm->temporal_update);
+    if (cm->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = cm->segment_pred_probs[i];
+        const int update = prob != MAX_PROB;
+        vp9_wb_write_bit(wb, update);
+        if (update)
+          vp9_wb_write_literal(wb, prob, 8);
+      }
+    }
   }
 
-  vp9_start_encode(&header_bc, cx_data);
+  // Segmentation data
+  vp9_wb_write_bit(wb, xd->update_mb_segmentation_data);
+  if (xd->update_mb_segmentation_data) {
+    vp9_wb_write_bit(wb, xd->mb_segment_abs_delta);
 
-  // TODO(jkoleszar): remove these two unused bits?
-  vp9_write_bit(&header_bc, pc->clr_type);
-  vp9_write_bit(&header_bc, pc->clamp_type);
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = vp9_segfeature_active(xd, i, j);
+        vp9_wb_write_bit(wb, active);
+        if (active) {
+          const int data = vp9_get_segdata(xd, i, j);
+          const int data_max = vp9_seg_feature_data_max(j);
 
-  // error resilient mode
-  vp9_write_bit(&header_bc, pc->error_resilient_mode);
-
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        int data = xd->mb_segment_tree_probs[i];
-
-        if (data != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_literal(&header_bc, data, 8);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          int data = pc->segment_pred_probs[i];
-
-          if (data != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_literal(&header_bc, data, 8);
+          if (vp9_is_segfeature_signed(j)) {
+            vp9_encode_unsigned_max(wb, abs(data), data_max);
+            vp9_wb_write_bit(wb, data < 0);
           } else {
-            vp9_write_bit(&header_bc, 0);
+            vp9_encode_unsigned_max(wb, data, data_max);
           }
         }
       }
     }
+  }
+}
 
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
 
-    // segment_reference_frames(cpi);
+static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
+  VP9_COMMON *const cm = &cpi->common;
 
-    if (xd->update_mb_segmentation_data) {
-      signed char Data;
+  // Mode
+  vp9_write_literal(w, MIN(cm->txfm_mode, ALLOW_32X32), 2);
+  if (cm->txfm_mode >= ALLOW_32X32)
+    vp9_write_bit(w, cm->txfm_mode == TX_MODE_SELECT);
 
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
+  // Probabilities
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    int i, j;
+    unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2];
+    unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2];
+    unsigned int ct_32x32p[TX_SIZE_MAX_SB - 1][2];
 
-      // For each segments id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          Data = vp9_get_segdata(xd, i, j);
 
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (Data < 0) {
-                Data = - Data;
-                vp9_encode_unsigned_max(&header_bc, Data,
-                                        vp9_seg_feature_data_max(j));
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_encode_unsigned_max(&header_bc, Data,
-                                        vp9_seg_feature_data_max(j));
-                vp9_write_bit(&header_bc, 0);
-              }
-            }
-            // Unsigned data element so no sign bit needed
-            else
-              vp9_encode_unsigned_max(&header_bc, Data,
-                                      vp9_seg_feature_data_max(j));
-          } else
-            vp9_write_bit(&header_bc, 0);
-        }
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
+                                     ct_8x8p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_8x8p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_8x8p[j]);
       }
     }
-  }
-
-  // Encode the common prediction model status flag probability updates for
-  // the reference frame
-  update_refpred_stats(cpi);
-  if (pc->frame_type != KEY_FRAME) {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (cpi->ref_pred_probs_update[i]) {
-        vp9_write_bit(&header_bc, 1);
-        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
-      } else {
-        vp9_write_bit(&header_bc, 0);
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+                                       ct_16x16p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_16x16p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_16x16p[j]);
       }
     }
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
+                                       ct_32x32p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_32x32p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_32x32p[j]);
+      }
+    }
+#ifdef MODE_STATS
+    if (!cpi->dummy_packing)
+      update_tx_count_stats(cm);
+#endif
   }
+}
 
-  pc->sb64_coded = get_binary_prob(cpi->sb64_count[0], cpi->sb64_count[1]);
-  vp9_write_literal(&header_bc, pc->sb64_coded, 8);
-  pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
-  vp9_write_literal(&header_bc, pc->sb32_coded, 8);
+static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
+                                     struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_bit(wb, type == SWITCHABLE);
+  if (type != SWITCHABLE)
+    vp9_wb_write_literal(wb, type, 2);
+}
 
-  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
-  if (cpi->mb.e_mbd.lossless) {
-    pc->txfm_mode = ONLY_4X4;
-  } else {
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
-                                cpi->txfm_count_16x16p[TX_4X4] +
-                                cpi->txfm_count_8x8p[TX_4X4],
-                                cpi->txfm_count_32x32p[TX_4X4] +
-                                cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32] +
-                                cpi->txfm_count_16x16p[TX_4X4] +
-                                cpi->txfm_count_16x16p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_16X16] +
-                                cpi->txfm_count_8x8p[TX_4X4] +
-                                cpi->txfm_count_8x8p[TX_8X8]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_8X8],
-                                cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32] +
-                                cpi->txfm_count_16x16p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_16X16]);
-      pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16],
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32]);
-    } else {
-      pc->prob_tx[0] = 128;
-      pc->prob_tx[1] = 128;
-      pc->prob_tx[2] = 128;
+static void fix_mcomp_filter_type(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cm->mcomp_filter_type == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[VP9_SWITCHABLE_FILTERS];
+    int i, j, c = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+      count[i] = 0;
+      for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
+        count[i] += cm->fc.switchable_interp_count[j][i];
+      c += (count[i] > 0);
     }
-    vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? pc->txfm_mode : 3, 2);
-    if (pc->txfm_mode > ALLOW_16X16) {
-      vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT);
+    if (c == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          cm->mcomp_filter_type = vp9_switchable_interp[i];
+          break;
+        }
+      }
     }
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[2], 8);
-    }
   }
+}
 
-  // Encode the loop filter level and type
-  vp9_write_bit(&header_bc, pc->filter_type);
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
-  if (pc->dering_enabled) {
-    vp9_write_bit(&header_bc, 1);
-    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
-  } else {
-    vp9_write_bit(&header_bc, 0);
+static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
+  int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
+  vp9_get_tile_n_bits(cm, &min_log2_tiles, &delta_log2_tiles);
+  n_tile_bits = cm->log2_tile_columns - min_log2_tiles;
+  for (n = 0; n < delta_log2_tiles; n++) {
+    if (n_tile_bits--) {
+      vp9_wb_write_bit(wb, 1);
+    } else {
+      vp9_wb_write_bit(wb, 0);
+      break;
+    }
   }
-#endif
 
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+  vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  if (cm->log2_tile_rows != 0)
+    vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
+}
 
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    int send_update = xd->mode_ref_lf_delta_update;
+static int get_refresh_mask(VP9_COMP *cpi) {
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+#if CONFIG_MULTIPLE_ARF
+    if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
+        !cpi->refresh_alt_ref_frame) {
+#else
+    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+#endif
+      // Preserve the previously existing golden frame and update the frame in
+      // the alt ref slot instead. This is highly specific to the use of
+      // alt-ref as a forward reference, and this needs to be generalized as
+      // other uses are implemented (like RTC/temporal scaling)
+      //
+      // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
+      // that happens in vp9_onyx_if.c:update_reference_frames() so that it can
+      // be done outside of the recode loop.
+      return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+             (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    } else {
+      int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+      // Determine which ARF buffer to use to encode this ARF frame.
+      if (cpi->multi_arf_enabled) {
+        int sn = cpi->sequence_number;
+        arf_idx = (cpi->frame_coding_order[sn] < 0) ?
+            cpi->arf_buffer_idx[sn + 1] :
+            cpi->arf_buffer_idx[sn];
+      }
+#endif
+      return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+             (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+             (cpi->refresh_alt_ref_frame << arf_idx);
+    }
+}
 
-    vp9_write_bit(&header_bc, send_update);
-    if (send_update) {
-      int Data;
+static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
 
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        Data = xd->ref_lf_deltas[i];
+  const int scaling_active = cm->width != cm->display_width ||
+                             cm->height != cm->display_height;
+  vp9_wb_write_bit(wb, scaling_active);
+  if (scaling_active) {
+    vp9_wb_write_literal(wb, cm->display_width - 1, 16);
+    vp9_wb_write_literal(wb, cm->display_height - 1, 16);
+  }
+}
 
-        // Frame level data
-        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
+static void write_frame_size(VP9_COMP *cpi,
+                             struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  vp9_wb_write_literal(wb, cm->width - 1, 16);
+  vp9_wb_write_literal(wb, cm->height - 1, 16);
 
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
+  write_display_size(cpi, wb);
+}
 
-      // Send update
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        Data = xd->mode_lf_deltas[i];
+static void write_frame_size_with_refs(VP9_COMP *cpi,
+                                       struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+                                      cpi->alt_fb_idx};
+  int i, found = 0;
 
-        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
+    found = cm->width == cfg->y_crop_width &&
+            cm->height == cfg->y_crop_height;
+    vp9_wb_write_bit(wb, found);
+    if (found)
+      break;
+  }
 
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
+  if (!found) {
+    vp9_wb_write_literal(wb, cm->width - 1, 16);
+    vp9_wb_write_literal(wb, cm->height - 1, 16);
   }
 
-  // signal here is multi token partition is enabled
-  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
-  vp9_write_literal(&header_bc, 0, 2);
+  write_display_size(cpi, wb);
+}
 
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+static void write_sync_code(struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
+  vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
+  vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+}
 
-  // Transmit Dc, Second order and Uv quantizer delta information
-  put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->uvdc_delta_q);
-  put_delta_q(&header_bc, pc->uvac_delta_q);
+static void write_uncompressed_header(VP9_COMP *cpi,
+                                      struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  // When there is a key frame all reference buffers are updated using the new key frame
-  if (pc->frame_type != KEY_FRAME) {
-    int refresh_mask;
+  // frame marker bits
+  vp9_wb_write_literal(wb, 0x2, 2);
 
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
-      /* Preserve the previously existing golden frame and update the frame in
-       * the alt ref slot instead. This is highly specific to the use of
-       * alt-ref as a forward reference, and this needs to be generalized as
-       * other uses are implemented (like RTC/temporal scaling)
-       *
-       * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
-       * that happens in vp9_onyx_if.c:update_reference_frames() so that it can
-       * be done outside of the recode loop.
-       */
-      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-                     (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+  // bitstream version.
+  // 00 - profile 0. 4:2:0 only
+  // 10 - profile 1. adds 4:4:4, 4:2:2, alpha
+  vp9_wb_write_bit(wb, cm->version);
+  vp9_wb_write_bit(wb, 0);
+
+  vp9_wb_write_bit(wb, 0);
+  vp9_wb_write_bit(wb, cm->frame_type);
+  vp9_wb_write_bit(wb, cm->show_frame);
+  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+
+  if (cm->frame_type == KEY_FRAME) {
+    write_sync_code(wb);
+    // colorspaces
+    // 000 - Unknown
+    // 001 - BT.601
+    // 010 - BT.709
+    // 011 - SMPTE-170
+    // 100 - SMPTE-240
+    // 101 - Reserved
+    // 110 - Reserved
+    // 111 - sRGB (RGB)
+    vp9_wb_write_literal(wb, 0, 3);
+    if (1 /* colorspace != sRGB */) {
+      vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+      if (cm->version == 1) {
+        vp9_wb_write_bit(wb, cm->subsampling_x);
+        vp9_wb_write_bit(wb, cm->subsampling_y);
+        vp9_wb_write_bit(wb, 0);  // has extra plane
+      }
     } else {
-      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-                     (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
-                     (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx);
+      assert(cm->version == 1);
+      vp9_wb_write_bit(wb, 0);  // has extra plane
     }
-    vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES);
-    vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2);
-    vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
-    vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    write_frame_size(cpi, wb);
+  } else {
+    const int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+                                              cpi->alt_fb_idx};
+    if (!cm->show_frame)
+      vp9_wb_write_bit(wb, cm->intra_only);
 
-    // Signal whether to allow high MV precision
-    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
-    if (pc->mcomp_filter_type == SWITCHABLE) {
-      /* Check to see if only one of the filters is actually used */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int i, j, c = 0;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        c += (count[i] > 0);
+    if (!cm->error_resilient_mode)
+      vp9_wb_write_literal(wb, cm->reset_frame_context, 2);
+
+    if (cm->intra_only) {
+      write_sync_code(wb);
+
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
+      write_frame_size(cpi, wb);
+    } else {
+      int i;
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+        vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LG2);
+        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]);
       }
-      if (c == 1) {
-        /* Only one filter is used. So set the filter at frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            pc->mcomp_filter_type = vp9_switchable_interp[i];
-            break;
-          }
-        }
-      }
+
+      write_frame_size_with_refs(cpi, wb);
+
+      vp9_wb_write_bit(wb, xd->allow_high_precision_mv);
+
+      fix_mcomp_filter_type(cpi);
+      write_interp_filter_type(cm->mcomp_filter_type, wb);
     }
-    // Signal the type of subpel filter to use
-    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
-    if (pc->mcomp_filter_type != SWITCHABLE)
-      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
-#if CONFIG_COMP_INTERINTRA_PRED
-    //  printf("Counts: %d %d\n", cpi->interintra_count[0],
-    //         cpi->interintra_count[1]);
-    if (!cpi->dummy_packing && pc->use_interintra)
-      pc->use_interintra = (cpi->interintra_count[1] > 0);
-    vp9_write_bit(&header_bc, pc->use_interintra);
-    if (!pc->use_interintra)
-      vp9_zero(cpi->interintra_count);
-#endif
   }
 
-  if (!pc->error_resilient_mode) {
-    vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
-    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
+  if (!cm->error_resilient_mode) {
+    vp9_wb_write_bit(wb, cm->refresh_frame_context);
+    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_write_literal(&header_bc, pc->frame_context_idx,
-                    NUM_FRAME_CONTEXTS_LG2);
+  vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2);
 
-#ifdef ENTROPY_STATS
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
+  encode_loopfilter(cm, xd, wb);
+  encode_quantization(cm, wb);
+  encode_segmentation(cpi, wb);
 
-  // If appropriate update the inter mode probability context and code the
-  // changes in the bitstream.
-  if (pc->frame_type != KEY_FRAME) {
-    int i, j;
-    int new_context[INTER_MODE_CONTEXTS][4];
-    if (!cpi->dummy_packing) {
-      update_inter_mode_probs(pc, new_context);
-    } else {
-      // In dummy pack assume context unchanged.
-      vpx_memcpy(new_context, pc->fc.vp9_mode_contexts,
-                 sizeof(pc->fc.vp9_mode_contexts));
-    }
+  write_tile_info(cm, wb);
+}
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      for (j = 0; j < 4; j++) {
-        if (new_context[i][j] != pc->fc.vp9_mode_contexts[i][j]) {
-          vp9_write(&header_bc, 1, 252);
-          vp9_write_literal(&header_bc, new_context[i][j], 8);
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i, bytes_packed;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-          // Only update the persistent copy if this is the "real pack"
-          if (!cpi->dummy_packing) {
-            pc->fc.vp9_mode_contexts[i][j] = new_context[i][j];
-          }
-        } else {
-          vp9_write(&header_bc, 0, 252);
-        }
-      }
-    }
-  }
+  uint8_t *cx_data = dest;
+  struct vp9_write_bit_buffer wb = {dest, 0};
+  struct vp9_write_bit_buffer first_partition_size_wb;
 
-#if CONFIG_NEW_MVREF
-  if ((pc->frame_type != KEY_FRAME)) {
-    int new_mvref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
-    int i, j;
+  write_uncompressed_header(cpi, &wb);
+  first_partition_size_wb = wb;
+  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-    update_mv_ref_probs(cpi, new_mvref_probs);
+  bytes_packed = vp9_rb_bytes_written(&wb);
+  cx_data += bytes_packed;
 
-    for (i = 0; i < MAX_REF_FRAMES; ++i) {
-      // Skip the dummy entry for intra ref frame.
-      if (i == INTRA_FRAME) {
-        continue;
-      }
+  compute_update_table();
 
-      // Encode any mandated updates to probabilities
-      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
-        if (new_mvref_probs[i][j] != xd->mb_mv_ref_probs[i][j]) {
-          vp9_write(&header_bc, 1, VP9_MVREF_UPDATE_PROB);
-          vp9_write_literal(&header_bc, new_mvref_probs[i][j], 8);
+  vp9_start_encode(&header_bc, cx_data);
 
-          // Only update the persistent copy if this is the "real pack"
-          if (!cpi->dummy_packing) {
-            xd->mb_mv_ref_probs[i][j] = new_mvref_probs[i][j];
-          }
-        } else {
-          vp9_write(&header_bc, 0, VP9_MVREF_UPDATE_PROB);
-        }
-      }
-    }
-  }
+#ifdef ENTROPY_STATS
+  if (pc->frame_type == INTER_FRAME)
+    active_section = 0;
+  else
+    active_section = 7;
 #endif
 
   vp9_clear_system_state();  // __asm emms;
 
-  vp9_copy(cpi->common.fc.pre_coef_probs_4x4,
-           cpi->common.fc.coef_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_coef_probs_8x8,
-           cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_coef_probs_16x16,
-           cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
-           cpi->common.fc.coef_probs_32x32);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,
-           cpi->common.fc.nzc_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,
-           cpi->common.fc.nzc_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,
-           cpi->common.fc.nzc_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,
-           cpi->common.fc.nzc_probs_32x32);
-  vp9_copy(cpi->common.fc.pre_nzc_pcat_probs,
-           cpi->common.fc.nzc_pcat_probs);
-  // NOTE that if the counts are reset, we also need to uncomment
-  // the count updates in the write_nzc function
-  /*
-  vp9_zero(cpi->common.fc.nzc_counts_4x4);
-  vp9_zero(cpi->common.fc.nzc_counts_8x8);
-  vp9_zero(cpi->common.fc.nzc_counts_16x16);
-  vp9_zero(cpi->common.fc.nzc_counts_32x32);
-  vp9_zero(cpi->common.fc.nzc_pcat_counts);
-  */
-#endif
-  vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
-  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
-  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
-  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
-  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
-#if CONFIG_COMP_INTERINTRA_PRED
-  cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
-#endif
-  vp9_zero(cpi->sub_mv_ref_count);
-  vp9_zero(cpi->mbsplit_count);
-  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_copy(pc->fc.pre_coef_probs, pc->fc.coef_probs);
+  vp9_copy(pc->fc.pre_y_mode_prob, pc->fc.y_mode_prob);
+  vp9_copy(pc->fc.pre_uv_mode_prob, pc->fc.uv_mode_prob);
+  vp9_copy(pc->fc.pre_partition_prob, pc->fc.partition_prob[INTER_FRAME]);
+  pc->fc.pre_nmvc = pc->fc.nmvc;
+  vp9_copy(pc->fc.pre_switchable_interp_prob, pc->fc.switchable_interp_prob);
+  vp9_copy(pc->fc.pre_inter_mode_probs, pc->fc.inter_mode_probs);
+  vp9_copy(pc->fc.pre_intra_inter_prob, pc->fc.intra_inter_prob);
+  vp9_copy(pc->fc.pre_comp_inter_prob, pc->fc.comp_inter_prob);
+  vp9_copy(pc->fc.pre_comp_ref_prob, pc->fc.comp_ref_prob);
+  vp9_copy(pc->fc.pre_single_ref_prob, pc->fc.single_ref_prob);
+  vp9_copy(pc->fc.pre_tx_probs_8x8p, pc->fc.tx_probs_8x8p);
+  vp9_copy(pc->fc.pre_tx_probs_16x16p, pc->fc.tx_probs_16x16p);
+  vp9_copy(pc->fc.pre_tx_probs_32x32p, pc->fc.tx_probs_32x32p);
+  vp9_copy(pc->fc.pre_mbskip_probs, pc->fc.mbskip_probs);
 
+  if (xd->lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    encode_txfm_probs(cpi, &header_bc);
+  }
+
   update_coef_probs(cpi, &header_bc);
-#if CONFIG_CODE_NONZEROCOUNT
-  update_nzc_probs(cpi, &header_bc);
-#endif
 
 #ifdef ENTROPY_STATS
   active_section = 2;
 #endif
 
-  // Write out the mb_no_coeff_skip flag
-  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
-  if (pc->mb_no_coeff_skip) {
-    int k;
+  vp9_update_skip_probs(cpi, &header_bc);
 
-    vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
-    }
-  }
-
-  if (pc->frame_type == KEY_FRAME) {
-    if (!pc->kf_ymode_probs_update) {
-      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
-    }
-  } else {
-    // Update the probabilities used to encode reference frame data
-    update_ref_probs(cpi);
-
+  if (pc->frame_type != KEY_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 1;
 #endif
 
+    update_inter_mode_probs(pc, &header_bc);
+    vp9_zero(cpi->common.fc.inter_mode_counts);
+
     if (pc->mcomp_filter_type == SWITCHABLE)
       update_switchable_interp_probs(cpi, &header_bc);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (pc->use_interintra) {
-      vp9_cond_prob_update(&header_bc,
-                           &pc->fc.interintra_prob,
-                           VP9_UPD_INTERINTRA_PROB,
-                           cpi->interintra_count);
-    }
-#endif
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp9_cond_prob_diff_update(&header_bc, &pc->fc.intra_inter_prob[i],
+                                VP9_MODE_UPDATE_PROB,
+                                cpi->intra_inter_count[i]);
 
-    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
-
-    {
+    if (pc->allow_comp_inter_inter) {
       const int comp_pred_mode = cpi->common.comp_pred_mode;
       const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
       const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
 
-      vp9_write(&header_bc, use_compound_pred, 128);
+      vp9_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
-        vp9_write(&header_bc, use_hybrid_pred, 128);
+        vp9_write_bit(&header_bc, use_hybrid_pred);
         if (use_hybrid_pred) {
-          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
-                                                   cpi->comp_pred_count[i]);
-            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
-          }
+          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+            vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_inter_prob[i],
+                                      VP9_MODE_UPDATE_PROB,
+                                      cpi->comp_inter_count[i]);
         }
       }
     }
-    update_mbintra_mode_probs(cpi, &header_bc);
 
-    vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
-  }
-
-  /* tiling */
-  {
-    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
-
-    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);
-    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;
-    for (n = 0; n < delta_log2_tiles; n++) {
-      if (n_tile_bits--) {
-        vp9_write_bit(&header_bc, 1);
-      } else {
-        vp9_write_bit(&header_bc, 0);
-        break;
+    if (pc->comp_pred_mode != COMP_PREDICTION_ONLY) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][0],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->single_ref_count[i][0]);
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][1],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->single_ref_count[i][1]);
       }
     }
-    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);
-    if (pc->log2_tile_rows != 0)
-      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);
-  }
 
-  vp9_stop_encode(&header_bc);
+    if (pc->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+      for (i = 0; i < REF_CONTEXTS; i++)
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_ref_prob[i],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->comp_ref_count[i]);
+    }
 
-  oh.first_partition_length_in_bytes = header_bc.pos;
+    update_mbintra_mode_probs(cpi, &header_bc);
 
-  /* update frame tag */
-  {
-    int scaling = (pc->width != pc->display_width ||
-                   pc->height != pc->display_height);
-    int v = (oh.first_partition_length_in_bytes << 8) |
-            (scaling << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
+    for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) {
+      vp9_prob Pnew[PARTITION_TYPES - 1];
+      unsigned int bct[PARTITION_TYPES - 1][2];
+      update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings,
+                  vp9_partition_tree, Pnew,
+                  pc->fc.partition_prob[pc->frame_type][i], bct,
+                  (unsigned int *)cpi->partition_count[i]);
+    }
 
-    assert(oh.first_partition_length_in_bytes <= 0xffff);
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
+    vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
   }
 
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
 
-  if (pc->frame_type == KEY_FRAME) {
-    decide_kf_ymode_entropy(cpi);
-  } else {
-    /* This is not required if the counts in cpi are consistent with the
-     * final packing pass */
-    // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-  }
+  vp9_stop_encode(&header_bc);
 
+
+  // first partition size
+  assert(header_bc.pos <= 0xffff);
+  vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
+  *size = bytes_packed + header_bc.pos;
+
   {
     int tile_row, tile_col, total_size = 0;
     unsigned char *data_ptr = cx_data + header_bc.pos;
@@ -2943,11 +1748,8 @@
         write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);
         vp9_stop_encode(&residual_bc);
         if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
-          /* size of this tile */
-          data_ptr[total_size + 0] = residual_bc.pos;
-          data_ptr[total_size + 1] = residual_bc.pos >> 8;
-          data_ptr[total_size + 2] = residual_bc.pos >> 16;
-          data_ptr[total_size + 3] = residual_bc.pos >> 24;
+          // size of this tile
+          write_be32(data_ptr + total_size, residual_bc.pos);
           total_size += 4;
         }
 
@@ -2999,21 +1801,18 @@
   FILE *f = fopen("coefupdprob.h", "w");
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_4X4],   BLOCK_TYPES,
                              "vp9_coef_update_probs_4x4[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_8X8],   BLOCK_TYPES,
                              "vp9_coef_update_probs_8x8[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_16X16], BLOCK_TYPES,
                              "vp9_coef_update_probs_16x16[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_32X32], BLOCK_TYPES,
                              "vp9_coef_update_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
   f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
-  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-  fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
+  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
   fclose(f);
 }
 #endif
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -12,6 +12,6 @@
 #ifndef VP9_ENCODER_VP9_BITSTREAM_H_
 #define VP9_ENCODER_VP9_BITSTREAM_H_
 
-void vp9_update_skip_probs(VP9_COMP *cpi);
+void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc);
 
 #endif  // VP9_ENCODER_VP9_BITSTREAM_H_
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -23,43 +23,13 @@
   int offset;
 } search_site;
 
-typedef struct block {
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *src_diff;
-  int16_t *coeff;
-
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *quant;
-  int16_t *quant_fast;      // fast quant deprecated for now
-  uint8_t *quant_shift;
-  int16_t *zbin;
-  int16_t *zbin_8x8;
-  int16_t *zbin_16x16;
-  int16_t *zbin_32x32;
-  int16_t *zrun_zbin_boost;
-  int16_t *zrun_zbin_boost_8x8;
-  int16_t *zrun_zbin_boost_16x16;
-  int16_t *zrun_zbin_boost_32x32;
-  int16_t *round;
-
-  // Zbin Over Quant value
-  short zbin_extra;
-
-  uint8_t **base_src;
-  uint8_t **base_second_src;
-  int src;
-  int src_stride;
-
-  int skip_block;
-} BLOCK;
-
 typedef struct {
   int count;
   struct {
-    B_PREDICTION_MODE mode;
+    MB_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16];
+  } bmi[4];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
@@ -81,18 +51,36 @@
   int comp_pred_diff;
   int single_pred_diff;
   int64_t txfm_rd_diff[NB_TXFM_MODES];
+
+  // Bit flag for each mode whether it has high error in comparison to others.
+  unsigned int modes_with_high_error;
+
+  // Bit flag for each ref frame whether it has high error compared to others.
+  unsigned int frames_with_high_error;
 } PICK_MODE_CONTEXT;
 
+struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
+  DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+  struct buf_2d src;
+
+  // Quantizer setings
+  int16_t *quant;
+  uint8_t *quant_shift;
+  int16_t *zbin;
+  int16_t *zrun_zbin_boost;
+  int16_t *round;
+
+  // Zbin Over Quant value
+  int16_t zbin_extra;
+};
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);
-  // 16 Y blocks, 4 U blocks, 4 V blocks,
-  BLOCK block[24];
+  struct macroblock_plane plane[MAX_MB_PLANE];
 
-  YV12_BUFFER_CONFIG src;
-
   MACROBLOCKD e_mbd;
+  int skip_block;
   PARTITION_INFO *partition_info; /* work pointer */
   PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
   PARTITION_INFO *pip;  /* Base of allocated array */
@@ -126,11 +114,9 @@
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  int mbmode_cost[2][MB_MODE_COUNT];
+  int mbmode_cost[MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
-  int i8x8_mode_costs[MB_MODE_COUNT];
-  int inter_bmode_costs[B_MODE_COUNT];
+  int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
 
@@ -145,36 +131,43 @@
 
   int encode_breakout;
 
-  // char * gf_active_ptr;
-  signed char *gf_active_ptr;
-
   unsigned char *active_ptr;
 
+  // note that token_costs is the cost when eob node is skipped
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];
-  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];
-  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];
-  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];
-#endif
+  vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
   int optimize;
 
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
+  // TODO(jingning): Need to refactor the structure arrays that buffers the
+  // coding mode decisions of each partition type.
+  PICK_MODE_CONTEXT ab4x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb4x8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x16_context[4][4][2];
+  PICK_MODE_CONTEXT sb16x8_context[4][4][2];
   PICK_MODE_CONTEXT mb_context[4][4];
+  PICK_MODE_CONTEXT sb32x16_context[4][2];
+  PICK_MODE_CONTEXT sb16x32_context[4][2];
   // when 4 MBs share coding parameters:
   PICK_MODE_CONTEXT sb32_context[4];
+  PICK_MODE_CONTEXT sb32x64_context[2];
+  PICK_MODE_CONTEXT sb64x32_context[2];
   PICK_MODE_CONTEXT sb64_context;
+  int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
+  BLOCK_SIZE_TYPE b_partitioning[4][4][4];
+  BLOCK_SIZE_TYPE mb_partitioning[4][4];
+  BLOCK_SIZE_TYPE sb_partitioning[4];
+  BLOCK_SIZE_TYPE sb64_partitioning;
+
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);
-  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2);
-  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);
-  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);
+  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
+                         int y_blocks);
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 #include "vp9/encoder/vp9_boolhuff.h"
+#include "vp9/common/vp9_entropy.h"
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
@@ -39,7 +40,7 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
 };
 
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->value    = 0;
@@ -46,13 +47,14 @@
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
+  vp9_write_bit(br, 0);
 }
 
-void vp9_stop_encode(BOOL_CODER *br) {
+void vp9_stop_encode(vp9_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    encode_bool(br, 0, 128);
+    vp9_write_bit(br, 0);
 
   // Ensure there's no ambigous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
@@ -59,107 +61,3 @@
     br->buffer[br->pos++] = 0;
 }
 
-
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max) {
-  assert(data <= max);
-  while (max) {
-    encode_bool(br, data & 1, 128);
-    data >>= 1;
-    max >>= 1;
-  }
-}
-
-int vp9_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if (v >= m) return ((v - m) << 1);
-  else return ((m - v) << 1) - 1;
-}
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return;
-  m = (1 << l) - n;
-  if (v < m)
-    vp9_encode_value(br, v, l - 1);
-  else {
-    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
-    vp9_encode_value(br, (v - m) & 1, 1);
-  }
-}
-
-int vp9_count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
-
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      vp9_encode_uniform(br, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_encode_value(br, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_encode_value(br, word - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += vp9_count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
--- a/vp9/encoder/vp9_boolhuff.h
+++ b/vp9/encoder/vp9_boolhuff.h
@@ -27,30 +27,21 @@
   unsigned int value;
   int count;
   unsigned int pos;
-  unsigned char *buffer;
+  uint8_t *buffer;
 
   // Variables used to track bit costs without outputing to the bitstream
   unsigned int  measure_cost;
   unsigned long bit_counter;
-} BOOL_CODER;
+} vp9_writer;
 
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max);
-extern void vp9_stop_encode(BOOL_CODER *bc);
 extern const unsigned int vp9_prob_cost[256];
 
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
+void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
+void vp9_stop_encode(vp9_writer *bc);
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
 
-
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -89,7 +80,7 @@
       int x = br->pos - 1;
 
       while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
+        br->buffer[x] = 0;
         x--;
       }
 
@@ -108,5 +99,17 @@
   br->lowvalue = lowvalue;
   br->range = range;
 }
+
+static void vp9_write_bit(vp9_writer *w, int bit) {
+  vp9_write(w, bit, 128);  // vp9_prob_half
+}
+
+static void vp9_write_literal(vp9_writer *w, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_write_bit(w, 1 & (data >> bit));
+}
+
 
 #endif  // VP9_ENCODER_VP9_BOOLHUFF_H_
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -591,23 +591,32 @@
   }
 }
 
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
-  int a1, b1, c1, d1;
+  int a1, b1, c1, d1, e1;
   short *ip = input;
   short *op = output;
   int pitch_short = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+    a1 = ip[0 * pitch_short];
+    b1 = ip[1 * pitch_short];
+    c1 = ip[2 * pitch_short];
+    d1 = ip[3 * pitch_short];
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = a1;
+    op[4] = c1;
+    op[8] = d1;
+    op[12] = b1;
 
     ip++;
     op++;
@@ -616,15 +625,22 @@
   op = output;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
 
-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = a1 << WHT_UPSCALE_FACTOR;
+    op[1] = c1 << WHT_UPSCALE_FACTOR;
+    op[2] = d1 << WHT_UPSCALE_FACTOR;
+    op[3] = b1 << WHT_UPSCALE_FACTOR;
 
     ip += 4;
     op += 4;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -10,6 +10,7 @@
 
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -20,7 +21,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -47,29 +47,12 @@
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mb_row, int mb_col);
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int output_enabled, int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize);
 
-static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col);
-
-static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col);
-
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
-#ifdef MODE_STATS
-unsigned int inter_y_modes[MB_MODE_COUNT];
-unsigned int inter_uv_modes[VP9_UV_MODES];
-unsigned int inter_b_modes[B_MODE_COUNT];
-unsigned int y_modes[VP9_YMODES];
-unsigned int i8x8_modes[VP9_I8X8_MODES];
-unsigned int uv_modes[VP9_UV_MODES];
-unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-unsigned int b_modes[B_MODE_COUNT];
-#endif
-
-
 /* activity_avg must be positive, or flat regions could get a zero weight
  *  (infinite lambda), which confounds analysis.
  * This also avoids the need for divide by zero checks in
@@ -98,8 +81,8 @@
    *  lambda using a non-linear combination (e.g., the smallest, or second
    *  smallest, etc.).
    */
-  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
-                          &sse);
+  act = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                          VP9_VAR_OFFS, 0, &sse);
   act <<= 4;
 
   /* If the region is flat, lower the activity some more. */
@@ -115,7 +98,9 @@
   return vp9_encode_intra(cpi, x, use_dc_pred);
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
 
+
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
 #define ALT_ACT_MEASURE 1
@@ -280,7 +265,7 @@
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
 #if ALT_ACT_MEASURE
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->left_available = (mb_col != 0);
       recon_yoffset += 16;
 #endif
@@ -298,19 +283,12 @@
       x->mb_activity_ptr++;
 
       // adjust to the next column of source macroblocks
-      x->src.y_buffer += 16;
+      x->plane[0].src.buf += 16;
     }
 
 
     // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-
-#if ALT_ACT_MEASURE
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-#endif
-
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
   }
 
   // Calculate an "average" MB activity
@@ -347,89 +325,9 @@
   adjust_act_zbin(cpi, x);
 }
 
-#if CONFIG_NEW_MVREF
-static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
-  int cost;
-
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      cost = vp9_cost_zero(ref_id_probs[0]);
-      break;
-    case 1:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_zero(ref_id_probs[1]);
-      break;
-    case 2:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_zero(ref_id_probs[2]);
-      break;
-    case 3:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_one(ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-  return cost;
-}
-
-// Estimate the cost of each coding the vector using each reference candidate
-static unsigned int pick_best_mv_ref(MACROBLOCK *x,
-                                     MV_REFERENCE_FRAME ref_frame,
-                                     int_mv target_mv,
-                                     int_mv * mv_ref_list,
-                                     int_mv * best_ref) {
-  int i;
-  int best_index = 0;
-  int cost, cost2;
-  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int max_mv = MV_MAX;
-
-  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], 0) +
-         vp9_mv_bit_cost(&target_mv, &mv_ref_list[0], x->nmvjointcost,
-                         x->mvcost, 96, xd->allow_high_precision_mv);
-
-  for (i = 1; i < MAX_MV_REF_CANDIDATES; ++i) {
-    // If we see a 0,0 reference vector for a second time we have reached
-    // the end of the list of valid candidate vectors.
-    if (!mv_ref_list[i].as_int) {
-      if (zero_seen)
-        break;
-      else
-        zero_seen = TRUE;
-    }
-
-    // Check for cases where the reference choice would give rise to an
-    // uncodable/out of range residual for row or col.
-    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
-        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
-      continue;
-    }
-
-    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], i) +
-            vp9_mv_bit_cost(&target_mv, &mv_ref_list[i], x->nmvjointcost,
-                            x->mvcost, 96, xd->allow_high_precision_mv);
-
-    if (cost2 < cost) {
-      cost = cost2;
-      best_index = i;
-    }
-  }
-  best_ref->as_int = mv_ref_list[best_index].as_int;
-
-  return best_index;
-}
-#endif
-
 static void update_state(VP9_COMP *cpi,
-                         PICK_MODE_CONTEXT *ctx, int block_size,
+                         PICK_MODE_CONTEXT *ctx,
+                         BLOCK_SIZE_TYPE bsize,
                          int output_enabled) {
   int i, x_idx, y;
   MACROBLOCK *const x = &cpi->mb;
@@ -436,49 +334,42 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int mb_mode = mi->mbmi.mode;
+#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
+  MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
+#endif
   int mb_mode_index = ctx->best_mode_index;
   const int mis = cpi->common.mode_info_stride;
-  int mb_block_size = 1 << mi->mbmi.sb_type;
+  const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
 
 #if CONFIG_DEBUG
   assert(mb_mode < MB_MODE_COUNT);
   assert(mb_mode_index < MAX_MODES);
-  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+  assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
+  assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
 #endif
-  assert(mi->mbmi.sb_type == (block_size >> 5));
 
+  assert(mi->mbmi.sb_type == bsize);
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  for (y = 0; y < mb_block_size; y++) {
-    for (x_idx = 0; x_idx < mb_block_size; x_idx++) {
-      if ((xd->mb_to_right_edge >> 7) + mb_block_size > x_idx &&
-          (xd->mb_to_bottom_edge >> 7) + mb_block_size > y) {
+  for (y = 0; y < bh; y++) {
+    for (x_idx = 0; x_idx < bw; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx &&
+          (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
         MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
-
-        vpx_memcpy(mi_addr, mi, sizeof(MODE_INFO));
+        *mi_addr = *mi;
       }
     }
   }
-  if (block_size == 16) {
+  if (bsize < BLOCK_SIZE_SB32X32) {
+    if (bsize < BLOCK_SIZE_MB16X16)
+      ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
   }
 
-  if (mb_mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-      assert(xd->block[i].bmi.as_mode.first < B_MODE_COUNT);
-    }
-  } else if (mb_mode == I8X8_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  } else if (mb_mode == SPLITMV) {
-    vpx_memcpy(x->partition_info, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    *x->partition_info = ctx->partition_info;
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
   }
 
   x->skip = ctx->skip;
@@ -485,18 +376,15 @@
   if (!output_enabled)
     return;
 
-  {
-    int segment_id = mbmi->segment_id;
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
-      }
+  if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
     }
   }
 
   if (cpi->common.frame_type == KEY_FRAME) {
     // Restore the coding modes to that held in the coding context
-    // if (mb_mode == B_PRED)
+    // if (mb_mode == I4X4_PRED)
     //    for (i = 0; i < 16; i++)
     //    {
     //        xd->block[i].bmi.as_mode =
@@ -515,8 +403,7 @@
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-      THR_I8X8_PRED /*I8X8_PRED*/,
-      THR_B_PRED /*B_PRED*/,
+      THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
 #endif
@@ -541,57 +428,34 @@
     */
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (mbmi->mode == SPLITMV || mbmi->mode == NEWMV) {
+    if (mbmi->ref_frame[0] != INTRA_FRAME &&
+        (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv, best_second_mv;
-      MV_REFERENCE_FRAME rf = mbmi->ref_frame;
-#if CONFIG_NEW_MVREF
-      unsigned int best_index;
-      MV_REFERENCE_FRAME sec_ref_frame = mbmi->second_ref_frame;
-#endif
+      const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
       best_mv.as_int = ctx->best_ref_mv.as_int;
       best_second_mv.as_int = ctx->second_best_ref_mv.as_int;
       if (mbmi->mode == NEWMV) {
-        best_mv.as_int = mbmi->ref_mvs[rf][0].as_int;
-        best_second_mv.as_int = mbmi->ref_mvs[mbmi->second_ref_frame][0].as_int;
-#if CONFIG_NEW_MVREF
-        best_index = pick_best_mv_ref(x, rf, mbmi->mv[0],
-                                      mbmi->ref_mvs[rf], &best_mv);
-        mbmi->best_index = best_index;
-        ++cpi->mb_mv_ref_count[rf][best_index];
-
-        if (mbmi->second_ref_frame > 0) {
-          unsigned int best_index;
-          best_index =
-              pick_best_mv_ref(x, sec_ref_frame, mbmi->mv[1],
-                               mbmi->ref_mvs[sec_ref_frame],
-                               &best_second_mv);
-          mbmi->best_second_index = best_index;
-          ++cpi->mb_mv_ref_count[sec_ref_frame][best_index];
-        }
-#endif
+        best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int;
+        best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int;
       }
       mbmi->best_mv.as_int = best_mv.as_int;
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-        mbmi->second_ref_frame <= INTRA_FRAME) {
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        ++cpi->interintra_count[1];
-        ++cpi->ymode_count[mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-        ++cpi->y_uv_mode_count[mbmi->interintra_mode][mbmi->interintra_uv_mode];
-#endif
-      } else {
-        ++cpi->interintra_count[0];
-      }
+
+    if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
+      int i, j;
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i &&
+              (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+            xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
-#endif
+
     if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        mbmi->mode >= NEARESTMV &&
-        mbmi->mode <= SPLITMV) {
-      ++cpi->switchable_interp_count
+        is_inter_mode(mbmi->mode)) {
+      ++cpi->common.fc.switchable_interp_count
           [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
           [vp9_switchable_interp_map[mbmi->interp_filter]];
     }
@@ -602,14 +466,16 @@
   }
 }
 
-static unsigned find_seg_id(uint8_t *buf, int block_size,
+static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
                             int start_y, int height, int start_x, int width) {
-  const int end_x = MIN(start_x + block_size, width);
-  const int end_y = MIN(start_y + block_size, height);
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+  const int end_x = MIN(start_x + bw, width);
+  const int end_y = MIN(start_y + bh, height);
   int x, y;
   unsigned seg_id = -1;
 
   buf += width * start_y;
+  assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end);
   for (y = start_y; y < end_y; y++, buf += width) {
     for (x = start_x; x < end_x; x++) {
       seg_id = MIN(seg_id, buf[x]);
@@ -619,22 +485,48 @@
   return seg_id;
 }
 
+void vp9_setup_src_planes(MACROBLOCK *x,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mb_row, int mb_col) {
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                         src->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                    src->alpha_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    setup_pred_plane(&x->plane[i].src,
+                     buffers[i], strides[i],
+                     mb_row, mb_col, NULL,
+                     x->e_mbd.plane[i].subsampling_x,
+                     x->e_mbd.plane[i].subsampling_y);
+  }
+}
+
 static void set_offsets(VP9_COMP *cpi,
-                        int mb_row, int mb_col, int block_size) {
+                        int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
+  const int idx_str = xd->mode_info_stride * mi_row + mi_col;
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+  const int mb_row = mi_row >> 1;
+  const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
-  const int idx_str = xd->mode_info_stride * mb_row + mb_col;
+  int i;
 
   // entropy context structures
-  xd->above_context = cm->above_context + mb_col;
-  xd->left_context  = cm->left_context + (mb_row & 3);
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].above_context = cm->above_context[i] +
+        (mi_col * 2 >>  xd->plane[i].subsampling_x);
+    xd->plane[i].left_context = cm->left_context[i] +
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+  }
 
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)&cpi->gf_active_flags[idx_map];
+  // partition contexts
+  set_partition_seg_context(cm, xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -644,30 +536,29 @@
   x->partition_info          = x->pi + idx_str;
   xd->mode_info_context      = cm->mi + idx_str;
   mbmi = &xd->mode_info_context->mbmi;
-  xd->prev_mode_info_context = cm->prev_mi + idx_str;
+  // Special case: if prev_mi is NULL, the previous mode info context
+  // cannot be used.
+  xd->prev_mode_info_context = cm->prev_mi ?
+                                 cm->prev_mi + idx_str : NULL;
 
   // Set up destination pointers
-  setup_pred_block(&xd->dst,
-                   &cm->yv12_fb[dst_fb_idx],
-                   mb_row, mb_col, NULL, NULL);
+  setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
 
   /* Set up limit values for MV components to prevent them from
    * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+  x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE +
+                   (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
+  x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE +
+                   (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
 
   // Set up distance of MB to edge of frame in 1/8th pel units
-  block_size >>= 4;  // in macroblock units
-  assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
-  set_mb_row(cm, xd, mb_row, block_size);
-  set_mb_col(cm, xd, mb_col, block_size);
+  assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
   /* set up source buffers */
-  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
 
   /* R/D setup */
   x->rddiv = cpi->RDDIV;
@@ -675,23 +566,17 @@
 
   /* segment ID */
   if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      mbmi->segment_id = find_seg_id(cpi->segmentation_map, block_size,
-                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
-    } else {
-      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, block_size,
-                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
-    }
-    assert(mbmi->segment_id <= 3);
+    uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row,
+                                   cm->mi_rows, mi_col, cm->mi_cols);
+
+    assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
     if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
         !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
       cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
     } else {
       const int y = mb_row & ~3;
@@ -698,8 +583,10 @@
       const int x = mb_col & ~3;
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
-      const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows;
-      const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start;
+      const int tile_progress =
+          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
+      const int mb_cols =
+          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
 
       cpi->seg0_progress =
           ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
@@ -709,453 +596,824 @@
   }
 }
 
-static int pick_mb_modes(VP9_COMP *cpi,
-                         int mb_row0,
-                         int mb_col0,
-                         TOKENEXTRA **tp,
-                         int *totalrate,
-                         int *totaldist) {
+static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
+                          TOKENEXTRA **tp, int *totalrate, int *totaldist,
+                          BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int i;
-  int splitmodes_used = 0;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
-                                                      + mb_col0;
 
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context + (mb_row0 & 2),
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
 
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    const int mb_row = mb_row0 + y_idx;
-    const int mb_col = mb_col0 + x_idx;
-    MB_MODE_INFO *mbmi;
+  set_offsets(cpi, mi_row, mi_col, bsize);
+  xd->mode_info_context->mbmi.sb_type = bsize;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
 
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      continue;
-    }
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx);
+  } else {
+    vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
+                              bsize, ctx);
+  }
+}
 
-    // Index of the MB in the SB 0..3
-    xd->mb_index = i;
-    set_offsets(cpi, mb_row, mb_col, 16);
+static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
+  if (cm->frame_type != KEY_FRAME) {
+    int segment_id, seg_ref_active;
 
-    mbmi = &xd->mode_info_context->mbmi;
-    mbmi->sb_type = BLOCK_SIZE_MB16X16;
+    segment_id = mbmi->segment_id;
+    seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                           SEG_LVL_REF_FRAME);
 
-    // Find best coding mode & reconstruct the MB so it is available
-    // as a predictor for MBs that follow in the SB
-    if (cm->frame_type == KEY_FRAME) {
-      int r, d;
-#if 0  // ENC_DEBUG
-      if (enc_debug)
-        printf("intra pick_mb_modes %d %d\n", mb_row, mb_col);
-#endif
-      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
+    if (!seg_ref_active)
+      cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
+                            [mbmi->ref_frame[0] > INTRA_FRAME]++;
 
-      // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
+    // If the segment reference feature is enabled we have only a single
+    // reference frame allowed for the segment so exclude it from
+    // the reference frame counts used to work out probabilities.
+    if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        cpi->comp_inter_count[vp9_get_pred_context(cm, xd,
+                                                   PRED_COMP_INTER_INTER)]
+                             [mbmi->ref_frame[1] > INTRA_FRAME]++;
 
-      // Note the encoder may have changed the segment_id
-
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[xd->sb_index][i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
-    } else {
-      int seg_id, r, d;
-
-#if 0  // ENC_DEBUG
-      if (enc_debug)
-        printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);
-#endif
-      vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      splitmodes_used += (mbmi->mode == SPLITMV);
-
-      // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
-
-      seg_id = mbmi->segment_id;
-      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
-        cpi->seg0_idx++;
+      if (mbmi->ref_frame[1] > INTRA_FRAME) {
+        cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
+                           [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
+      } else {
+        cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
+                             [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+        if (mbmi->ref_frame[0] != LAST_FRAME)
+          cpi->single_ref_count[vp9_get_pred_context(cm, xd,
+                                                     PRED_SINGLE_REF_P2)]
+                               [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
       }
-      if (!xd->segmentation_enabled ||
-          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
-          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
-          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
-          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
-        // Get the prediction context and status
-        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
-        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
-
-        // Count prediction success
-        cpi->ref_pred_count[pred_context][pred_flag]++;
-      }
     }
+    // Count of last ref frame 0,0 usage
+    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame[0] == LAST_FRAME))
+      cpi->inter_zz_count++;
   }
+}
 
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context + (mb_row0 & 2),
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
+// TODO(jingning): the variables used here are little complicated. need further
+// refactoring on organizing the the temporary buffers, when recursive
+// partition down to 4x4 block size is enabled.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
 
-  return splitmodes_used;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_context;
+    case BLOCK_SIZE_SB64X32:
+      return &x->sb64x32_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X64:
+      return &x->sb32x64_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb32_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X16:
+      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB16X32:
+      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X4:
+      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB4X8:
+      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_AB4X4:
+      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);
+      return NULL;
+  }
 }
 
-static void pick_sb_modes(VP9_COMP *cpi,
-                          int mb_row,
-                          int mb_col,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_partitioning;
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb_partitioning[xd->sb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8],
+                            PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    vpx_memcpy(cm->above_context[p] +
+               ((mi_col * 2) >> xd->plane[p].subsampling_x),
+               a + bw * p,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               l + bh * p,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(cm->above_seg_context + mi_col, sa,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                          PARTITION_CONTEXT sa[8],
+                          PARTITION_CONTEXT sl[8],
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
 
-  set_offsets(cpi, mb_row, mb_col, 32);
-  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb32(cpi, x,
-                                totalrate,
-                                totaldist);
-
-    /* Save the coding context */
-    vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
-               sizeof(MODE_INFO));
-  } else {
-    vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist);
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bw * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bh * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
   }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * mh);
 }
 
-static void pick_sb64_modes(VP9_COMP *cpi,
-                            int mb_row,
-                            int mb_col,
-                            TOKENEXTRA **tp,
-                            int *totalrate,
-                            int *totaldist) {
+static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
+                     int mi_row, int mi_col, int output_enabled,
+                     BLOCK_SIZE_TYPE bsize, int sub_index) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  set_offsets(cpi, mb_row, mb_col, 64);
-  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist);
+  if (sub_index != -1)
+    *(get_sb_index(xd, bsize)) = sub_index;
 
-    /* Save the coding context */
-    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO));
-  } else {
-    vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist);
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  set_offsets(cpi, mi_row, mi_col, bsize);
+  update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
+
+  if (output_enabled) {
+    update_stats(cpi, mi_row, mi_col);
+
+    (*tp)->token = EOSB_TOKEN;
+    (*tp)++;
   }
 }
 
-static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) {
+static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
+                      int mi_row, int mi_col, int output_enabled,
+                      BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+  const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
+  int bwl, bhl;
+  int UNINITIALIZED_IS_SAFE(pl);
 
-  if (cm->frame_type == KEY_FRAME) {
-#ifdef MODE_STATS
-    y_modes[mbmi->mode]++;
-#endif
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  c1 = BLOCK_SIZE_AB4X4;
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    c1 = *(get_sb_partitioning(x, bsize));
+  }
+
+  bwl = b_width_log2(c1), bhl = b_height_log2(c1);
+
+  if (bsl == bwl && bsl == bhl) {
+    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
+        cpi->partition_count[pl][PARTITION_NONE]++;
+    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+  } else if (bsl == bhl && bsl > bwl) {
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_VERT]++;
+    encode_b(cpi, tp, mi_row, mi_col,      output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+  } else if (bsl == bwl && bsl > bhl) {
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_HORZ]++;
+    encode_b(cpi, tp, mi_row,      mi_col, output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
   } else {
-    int segment_id, seg_ref_active;
+    BLOCK_SIZE_TYPE subsize;
+    int i;
 
-    if (mbmi->ref_frame) {
-      int pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+    assert(bwl < bsl && bhl < bsl);
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
 
-      if (mbmi->second_ref_frame <= INTRA_FRAME)
-        cpi->single_pred_count[pred_context]++;
-      else
-        cpi->comp_pred_count[pred_context]++;
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_SPLIT]++;
+
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
+
+      *(get_sb_index(xd, subsize)) = i;
+      encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+                output_enabled, subsize);
     }
+  }
 
-#ifdef MODE_STATS
-    inter_y_modes[mbmi->mode]++;
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, c1, bsize);
+  }
+}
 
-    if (mbmi->mode == SPLITMV) {
-      int b;
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 2;  //
+  int block_row, block_col;
+  int row, col;
 
-      for (b = 0; b < x->partition_info->count; b++) {
-        inter_b_modes[x->partition_info->bmi[b].mode]++;
+  // this test function sets the entire macroblock to the same bsize
+  for (block_row = 0; block_row < 8; block_row += bs) {
+    for (block_col = 0; block_col < 8; block_col += bs) {
+      for (row = 0; row < bs; row++) {
+        for (col = 0; col < bs; col++) {
+          m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
+        }
       }
     }
-#endif
+  }
+}
 
-    // If we have just a single reference frame coded for a segment then
-    // exclude from the reference frame counts used to work out
-    // probabilities. NOTE: At the moment we dont support custom trees
-    // for the reference frame coding for each segment but this is a
-    // possible future action.
-    segment_id = mbmi->segment_id;
-    seg_ref_active = vp9_segfeature_active(xd, segment_id,
-                                           SEG_LVL_REF_FRAME);
-    if (!seg_ref_active ||
-        ((vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-          vp9_check_segref(xd, segment_id, LAST_FRAME) +
-          vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, segment_id, ALTREF_FRAME)) > 1)) {
-      cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+static void set_block_size(VP9_COMMON *const cm,
+                           MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
+                           int mi_row, int mi_col) {
+  int row, col;
+  int bwl = b_width_log2(bsize);
+  int bhl = b_height_log2(bsize);
+  int bsl = (bwl > bhl ? bwl : bhl);
+
+  int bs = (1 << bsl) / 2;  //
+  MODE_INFO *m2 = m + mi_row * mis + mi_col;
+  for (row = 0; row < bs; row++) {
+    for (col = 0; col < bs; col++) {
+      if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
+        continue;
+      m2[row*mis+col].mbmi.sb_type = bsize;
     }
-    // Count of last ref frame 0,0 usage
-    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-      cpi->inter_zz_count++;
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);
-#endif
 }
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int count;
+  int variance;
+} var;
 
-static void encode_sb(VP9_COMP *cpi,
-                      int mb_row,
-                      int mb_col,
-                      int output_enabled,
-                      TOKENEXTRA **tp, int is_sb) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+#define VT(TYPE, BLOCKSIZE) \
+  typedef struct { \
+    var none; \
+    var horz[2]; \
+    var vert[2]; \
+    BLOCKSIZE split[4]; } TYPE;
 
-  cpi->sb32_count[is_sb]++;
-  if (is_sb) {
-    set_offsets(cpi, mb_row, mb_col, 32);
-    update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled);
+VT(v8x8, var)
+VT(v16x16, v8x8)
+VT(v32x32, v16x16)
+VT(v64x64, v32x32)
 
-    encode_superblock32(cpi, tp,
-                        output_enabled, mb_row, mb_col);
-    if (output_enabled) {
-      update_stats(cpi, mb_row, mb_col);
-    }
+typedef enum {
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;
 
-    if (output_enabled) {
-      (*tp)->Token = EOSB_TOKEN;
-      (*tp)++;
-      if (mb_row < cm->mb_rows)
-        cpi->tplist[mb_row].stop = *tp;
-    }
-  } else {
-    int i;
+// Set variance values given sum square error, sum error, count.
+static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->count = c;
+  v->variance = 256
+      * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+      / v->count;
+}
 
-    for (i = 0; i < 4; i++) {
-      const int x_idx = i & 1, y_idx = i >> 1;
+// Combine 2 variance structures by summing the sum_error, sum_square_error,
+// and counts and then calculating the new variance.
+void sum_2_variances(var *r, var *a, var*b) {
+  fill_variance(r, a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->count + b->count);
+}
+// Fill one level of our variance tree,  by summing the split sums into each of
+// the horizontal, vertical and none from split and recalculating variance.
+#define fill_variance_tree(VT) \
+  sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
+  sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
+  sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
+  sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
+  sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
 
-      if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
-        // MB lies outside frame, move on
-        continue;
-      }
+// Set the blocksize in the macroblock info structure if the variance is less
+// than our threshold to one of none, horz, vert.
+#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
+  if (VT.none.variance < threshold) { \
+    set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
+    ACTION; \
+  }
 
-      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16);
-      xd->mb_index = i;
-      update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled);
+static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
+                                int mi_col) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  // TODO(JBB): More experimentation or testing of this threshold;
+  int64_t threshold = 4;
+  int i, j, k;
+  v64x64 vt;
+  unsigned char * s;
+  int sp;
+  const unsigned char * d = xd->plane[0].pre->buf;
+  int dp = xd->plane[0].pre->stride;
+  int pixels_wide = 64, pixels_high = 64;
 
-      if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-        vp9_activity_masking(cpi, x);
+  vpx_memset(&vt, 0, sizeof(vt));
 
-      encode_macroblock(cpi, tp,
-                        output_enabled, mb_row + y_idx, mb_col + x_idx);
-      if (output_enabled) {
-        update_stats(cpi, mb_row + y_idx, mb_col + x_idx);
-      }
+  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
 
-      if (output_enabled) {
-        (*tp)->Token = EOSB_TOKEN;
-       (*tp)++;
-        if (mb_row + y_idx < cm->mb_rows)
-          cpi->tplist[mb_row + y_idx].stop = *tp;
-      }
+  if (xd->mb_to_right_edge < 0)
+    pixels_wide += (xd->mb_to_right_edge >> 3);
+
+  if (xd->mb_to_bottom_edge < 0)
+    pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
+  // but this needs more experimentation.
+  threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
+
+  // if ( cm->frame_type == KEY_FRAME ) {
+  d = vp9_64x64_zeros;
+  dp = 64;
+  // }
+
+  // Fill in the entire tree of 8x8 variances for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    for (j = 0; j < 4; j++) {
+      const int x_idx = x32_idx + ((j & 1) << 4);
+      const int y_idx = y32_idx + ((j >> 1) << 4);
+      const uint8_t *st = s + y_idx * sp + x_idx;
+      const uint8_t *dt = d + y_idx * dp + x_idx;
+      unsigned int sse = 0;
+      int sum = 0;
+      v16x16 *vst = &vt.split[i].split[j];
+      sse = sum = 0;
+      if (x_idx < pixels_wide && y_idx < pixels_high)
+        vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum);
+      fill_variance(&vst->split[0].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx + 8 < pixels_wide && y_idx < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum);
+      fill_variance(&vst->split[1].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx < pixels_wide && y_idx + 8 < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum);
+      fill_variance(&vst->split[2].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse,
+                            &sum);
+      fill_variance(&vst->split[3].none, sse, sum, 64);
     }
   }
+  // Fill the rest of the variance tree by summing the split partition
+  // values.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      fill_variance_tree(&vt.split[i].split[j])
+    }
+    fill_variance_tree(&vt.split[i])
+  }
+  fill_variance_tree(&vt)
 
-  // debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
+  // Now go through the entire structure,  splitting every blocksize until
+  // we get to one that's got a variance lower than our threshold,  or we
+  // hit 8x8.
+  set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
+  for (i = 0; i < 4; ++i) {
+    const int x32_idx = ((i & 1) << 2);
+    const int y32_idx = ((i >> 1) << 2);
+    set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx,
+                continue);
+
+    for (j = 0; j < 4; ++j) {
+      const int x16_idx = ((j & 1) << 1);
+      const int y16_idx = ((j >> 1) << 1);
+      set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx,
+                  mi_col+x32_idx+x16_idx, continue);
+
+      for (k = 0; k < 4; ++k) {
+        const int x8_idx = (k & 1);
+        const int y8_idx = (k >> 1);
+        set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+                       mi_row + y32_idx + y16_idx + y8_idx,
+                       mi_col + x32_idx + x16_idx + x8_idx);
+      }
+    }
   }
-#endif
 }
+static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
+                             int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+                             int *rate, int *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl = b_width_log2(m->mbmi.sb_type);
+  int bhl = b_height_log2(m->mbmi.sb_type);
+  int bsl = b_width_log2(bsize);
+  int bh = (1 << bhl);
+  int bs = (1 << bsl);
+  int bss = (1 << bsl)/4;
+  int i, pl;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  int r = 0, d = 0;
 
-static void encode_sb64(VP9_COMP *cpi,
-                        int mb_row,
-                        int mb_col,
-                        TOKENEXTRA **tp, int is_sb[4]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  cpi->sb64_count[is_sb[0] == 2]++;
-  if (is_sb[0] == 2) {
-    set_offsets(cpi, mb_row, mb_col, 64);
-    update_state(cpi, &x->sb64_context, 64, 1);
-    encode_superblock64(cpi, tp,
-                        1, mb_row, mb_col);
-    update_stats(cpi, mb_row, mb_col);
 
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows)
-      cpi->tplist[mb_row].stop = *tp;
-  } else {
-    int i;
+  // parse the partition type
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
 
-    for (i = 0; i < 4; i++) {
-      const int x_idx = i & 1, y_idx = i >> 1;
+  subsize = get_subsize(bsize, partition);
 
-      if (mb_row + y_idx * 2 >= cm->mb_rows ||
-          mb_col + x_idx * 2 >= cm->mb_cols) {
-        // MB lies outside frame, move on
-        continue;
+  // TODO(JBB): this restriction is here because pick_sb_modes can return
+  // r's that are INT_MAX meaning we can't select a mode / mv for this block.
+  // when the code is made to work for less than sb8x8 we need to come up with
+  // a solution to this problem.
+  assert(subsize >= BLOCK_SIZE_SB8X8);
+
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  pl = partition_plane_context(xd, bsize);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                    get_block_context(x, bsize));
+      r += x->partition_cost[pl][PARTITION_NONE];
+      break;
+    case PARTITION_HORZ:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_row + (bh >> 1) <= cm->mi_rows) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
       }
-      xd->sb_index = i;
-      encode_sb(cpi, mb_row + 2 * y_idx, mb_col + 2 * x_idx, 1, tp,
-                is_sb[i]);
-    }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_HORZ];
+      break;
+    case PARTITION_VERT:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_col + (bs >> 1) <= cm->mi_cols) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_VERT];
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      break;
+    case PARTITION_SPLIT:
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (bs >> 2);
+        int y_idx = (i >> 1) * (bs >> 2);
+        int jj = i >> 1, ii = i & 0x01;
+        int rt, dt;
+
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        *(get_sb_index(xd, subsize)) = i;
+
+        rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
+                         mi_col + x_idx, subsize, &rt, &dt);
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_SPLIT];
+      break;
+    default:
+      assert(0);
   }
+
+  // update partition context
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8
+      && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (r < INT_MAX && d < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+  *rate = r;
+  *dist = d;
 }
 
-static void encode_sb_row(VP9_COMP *cpi,
-                          int mb_row,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previously rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize,
+                              int *rate, int *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mb_col;
+  int bsl = b_width_log2(bsize), bs = 1 << bsl;
+  int ms = bs / 2;
+  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  int i, pl;
+  BLOCK_SIZE_TYPE subsize;
+  int srate = INT_MAX, sdist = INT_MAX;
 
-  // Initialize the left context for the new SB row
-  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
 
-  // Code each SB in the row
-  for (mb_col = cm->cur_tile_mb_col_start;
-       mb_col < cm->cur_tile_mb_col_end; mb_col += 4) {
-    int i;
-    int sb32_rate = 0, sb32_dist = 0;
-    int is_sb[4];
-    int sb64_rate = INT_MAX, sb64_dist;
-    int sb64_skip = 0;
-    ENTROPY_CONTEXT_PLANES l[4], a[4];
-    TOKENEXTRA *tp_orig = *tp;
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-    memcpy(&a, cm->above_context + mb_col, sizeof(a));
-    memcpy(&l, cm->left_context, sizeof(l));
-    for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << 1, y_idx = i & 2;
-      int mb_rate = 0, mb_dist = 0;
-      int sb_rate = INT_MAX, sb_dist;
-      int splitmodes_used = 0;
-      int sb32_skip = 0;
+  // PARTITION_SPLIT
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int r4 = 0, d4 = 0;
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    *(get_sb_partitioning(x, bsize)) = subsize;
 
-      if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
+    for (i = 0; i < 4; ++i) {
+      int x_idx = (i & 1) * (ms >> 1);
+      int y_idx = (i >> 1) * (ms >> 1);
+      int r = 0, d = 0;
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      xd->sb_index = i;
+      *(get_sb_index(xd, subsize)) = i;
+      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                        &r, &d);
 
-      splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
-                                      tp, &mb_rate, &mb_dist);
+      r4 += r;
+      d4 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r4 < INT_MAX)
+      r4 += x->partition_cost[pl][PARTITION_SPLIT];
+    assert(r4 >= 0);
+    assert(d4 >= 0);
+    srate = r4;
+    sdist = d4;
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-      mb_rate += vp9_cost_bit(cm->sb32_coded, 0);
+  // PARTITION_HORZ
+  if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+    int r2, d2;
+    int r = 0, d = 0;
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
 
-      if (cpi->sf.splitmode_breakout) {
-        sb32_skip = splitmodes_used;
-        sb64_skip += splitmodes_used;
-      }
+    if (mi_row + (ms >> 1) < cm->mi_rows) {
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      if ( !sb32_skip &&
-           !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) ||
-             ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
-        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-        pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
-                      tp, &sb_rate, &sb_dist);
-        sb_rate += vp9_cost_bit(cm->sb32_coded, 1);
-      }
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_HORZ];
+    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-      /* Decide whether to encode as a SB or 4xMBs */
-      if (sb_rate < INT_MAX &&
-          RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-              RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-        is_sb[i] = 1;
-        sb32_rate += sb_rate;
-        sb32_dist += sb_dist;
-      } else {
-        is_sb[i] = 0;
-        sb32_rate += mb_rate;
-        sb32_dist += mb_dist;
+  // PARTITION_VERT
+  if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+    int r2, d2;
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+    if (mi_col + (ms >> 1) < cm->mi_cols) {
+      int r = 0, d = 0;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-        // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
-        if (cpi->sf.mb16_breakout) {
-          ++sb64_skip;
-        }
-      }
-
-      /* Encode SB using best computed mode(s) */
-      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
-      // for each level that we go up, we can just keep tokens and recon
-      // pixels of the lower level; also, inverting SB/MB order (big->small
-      // instead of small->big) means we can use as threshold for small, which
-      // may enable breakouts if RD is not good enough (i.e. faster)
-      encode_sb(cpi, mb_row + y_idx, mb_col + x_idx, 0, tp, is_sb[i]);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
     }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_VERT];
+    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-    memcpy(cm->above_context + mb_col, &a, sizeof(a));
-    memcpy(cm->left_context, &l, sizeof(l));
-    sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);
-
-    if (!sb64_skip &&
-        !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) ||
-          ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
-      pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);
-      sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
+  // PARTITION_NONE
+  if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+      (mi_col + (ms >> 1) < cm->mi_cols)) {
+    int r, d;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                  get_block_context(x, bsize));
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_NONE];
     }
 
-    /* Decide whether to encode as a SB or 4xMBs */
-    if (sb64_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist) <
-            RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-      is_sb[0] = 2;
-      *totalrate += sb64_rate;
-    } else {
-      *totalrate += sb32_rate;
+    if (RDCOST(x->rdmult, x->rddiv, r, d) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r;
+      sdist = d;
+      if (bsize >= BLOCK_SIZE_SB8X8)
+        *(get_sb_partitioning(x, bsize)) = bsize;
     }
+  }
 
-    assert(tp_orig == *tp);
-    encode_sb64(cpi, mb_row, mb_col, tp, is_sb);
+  *rate = srate;
+  *dist = sdist;
+
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (srate < INT_MAX && sdist < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+
+  if (bsize == BLOCK_SIZE_SB64X64) {
     assert(tp_orig < *tp);
+    assert(srate < INT_MAX);
+    assert(sdist < INT_MAX);
+  } else {
+    assert(tp_orig == *tp);
   }
 }
 
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+                       TOKENEXTRA **tp, int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
+  vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = cm->cur_tile_mi_col_start;
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) {
+    int dummy_rate, dummy_dist;
+    if (cpi->speed < 5) {
+      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                        &dummy_rate, &dummy_dist);
+    } else {
+      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+      MODE_INFO *m = cm->mi + idx_str;
+      // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64);
+      choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+      rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                       &dummy_rate, &dummy_dist);
+    }
+  }
+}
+
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
@@ -1163,7 +1421,6 @@
 
   x->act_zbin_adj = 0;
   cpi->seg0_idx = 0;
-  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
 
   xd->mode_info_stride = cm->mode_info_stride;
   xd->frame_type = cm->frame_type;
@@ -1176,42 +1433,39 @@
     vp9_init_mbmode_probs(cm);
 
   // Copy data over into macro block data structures.
-  x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
-  xd->dst = cm->yv12_fb[cm->new_fb_idx];
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
 
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+  // TODO(jkoleszar): are these initializations required?
+  setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
+                   0, 0, NULL, NULL);
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  vp9_setup_block_ptrs(x);
-
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
-  vp9_zero(cpi->count_mb_ref_frame_usage)
-  vp9_zero(cpi->bmode_count)
-  vp9_zero(cpi->ymode_count)
-  vp9_zero(cpi->i8x8_mode_count)
+  vp9_zero(cpi->y_mode_count)
   vp9_zero(cpi->y_uv_mode_count)
-  vp9_zero(cpi->sub_mv_ref_count)
-  vp9_zero(cpi->mbsplit_count)
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->sb_ymode_count)
-  vp9_zero(cpi->sb32_count);
-  vp9_zero(cpi->sb64_count);
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_zero(cpi->interintra_count);
-  vp9_zero(cpi->interintra_select_count);
-#endif
+  vp9_zero(cm->fc.inter_mode_counts)
+  vp9_zero(cpi->partition_count);
+  vp9_zero(cpi->intra_inter_count);
+  vp9_zero(cpi->comp_inter_count);
+  vp9_zero(cpi->single_ref_count);
+  vp9_zero(cpi->comp_ref_count);
+  vp9_zero(cm->fc.tx_count_32x32p);
+  vp9_zero(cm->fc.tx_count_16x16p);
+  vp9_zero(cm->fc.tx_count_8x8p);
+  vp9_zero(cm->fc.mbskip_count);
 
-  vpx_memset(cm->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
-  xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
+  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mi_cols_aligned_to_sb(cm));
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -1218,37 +1472,32 @@
   if (lossless) {
     cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
-    cpi->zbin_mode_boost_enabled  = FALSE;
+    cpi->zbin_mode_boost_enabled  = 0;
     cpi->common.txfm_mode         = ONLY_4X4;
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 }
 
 
 static void encode_frame_internal(VP9_COMP *cpi) {
-  int mb_row;
+  int mi_row;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int totalrate;
 
-//   fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
-//            cpi->common.current_video_frame, cpi->common.show_frame,
-//            cm->frame_type);
+//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
+//           cpi->common.current_video_frame, cpi->common.show_frame,
+//           cm->frame_type);
 
-  // Compute a modified set of reference frame probabilities to use when
-  // prediction fails. These are based on the current general estimates for
-  // this frame which may be updated with each iteration of the recode loop.
-  vp9_compute_mod_refprobs(cm);
-
 // debug output
 #if DBG_PRNT_SEGMAP
   {
@@ -1264,10 +1513,7 @@
   // Reset frame count of inter 0,0 motion vector usage.
   cpi->inter_zz_count = 0;
 
-  cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
-  cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
-
-  vp9_zero(cpi->switchable_interp_count);
+  vp9_zero(cm->fc.switchable_interp_count);
   vp9_zero(cpi->best_switchable_interp_count);
 
   xd->mode_info_context = cm->mi;
@@ -1274,31 +1520,18 @@
   xd->prev_mode_info_context = cm->prev_mi;
 
   vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts_4x4);
-  vp9_zero(cpi->coef_counts_8x8);
-  vp9_zero(cpi->coef_counts_16x16);
-  vp9_zero(cpi->coef_counts_32x32);
+  vp9_zero(cpi->coef_counts);
   vp9_zero(cm->fc.eob_branch_counts);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
-#endif
-#if CONFIG_NEW_MVREF
-  vp9_zero(cpi->mb_mv_ref_count);
-#endif
 
-  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
-                            cm->y1dc_delta_q == 0 &&
-                            cm->uvdc_delta_q == 0 &&
-                            cm->uvac_delta_q == 0);
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
+                           cm->y_dc_delta_q == 0 &&
+                           cm->uv_dc_delta_q == 0 &&
+                           cm->uv_ac_delta_q == 0;
   switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
 
   vp9_frame_init_quantizer(cpi);
 
-  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
   vp9_initialize_me_consts(cpi, cm->base_qindex);
 
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
@@ -1313,12 +1546,11 @@
   init_encode_frame_mb_context(cpi);
 
   vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
-  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
-  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
-  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
-  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
   vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+  vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
+
+  set_prev_mi(cm);
+
   {
     struct vpx_usec_timer  emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -1336,11 +1568,13 @@
 
           // For each row of SBs in the frame
           vp9_get_tile_col_offsets(cm, tile_col);
-          for (mb_row = cm->cur_tile_mb_row_start;
-               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {
-            encode_sb_row(cpi, mb_row, &tp, &totalrate);
-          }
+          for (mi_row = cm->cur_tile_mi_row_start;
+               mi_row < cm->cur_tile_mi_row_end;
+               mi_row += 8)
+            encode_sb_row(cpi, mi_row, &tp, &totalrate);
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <=
+                 get_token_alloc(cm->mb_rows, cm->mb_cols));
         }
       }
     }
@@ -1365,15 +1599,6 @@
   int ref_flags = cpi->ref_frame_flags;
 
   if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
-    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
-        vp9_check_segref(xd, 1, LAST_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
-        vp9_check_segref(xd, 1, GOLDEN_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
-        vp9_check_segref(xd, 1, ALTREF_FRAME))
-      return 1;
     return 0;
   } else {
     return (!!(ref_flags & VP9_GOLD_FLAG) +
@@ -1382,23 +1607,6 @@
   }
 }
 
-static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
-                                    MODE_INFO *mi, TX_SIZE txfm_max) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-
-  if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
-    MACROBLOCK *const x = &cpi->mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
-    const int segment_id = mbmi->segment_id;
-
-    xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-    mbmi->txfm_size = txfm_max;
-  }
-}
-
 static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
   int x, y;
 
@@ -1422,96 +1630,120 @@
   }
 }
 
-static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
-                                      int mis, TX_SIZE txfm_max,
-                                      int mb_rows_left, int mb_cols_left) {
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
+                                   int mis, TX_SIZE txfm_max,
+                                   int bw, int bh, int mi_row, int mi_col,
+                                   BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
   if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
     MACROBLOCK *const x = &cpi->mb;
     MACROBLOCKD *const xd = &x->e_mbd;
     const int segment_id = mbmi->segment_id;
-    const int ymbs = MIN(2, mb_rows_left);
-    const int xmbs = MIN(2, mb_cols_left);
+    const int ymbs = MIN(bh, cm->mi_rows - mi_row);
+    const int xmbs = MIN(bw, cm->mi_cols - mi_col);
 
     xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
+           get_skip_flag(mi, mis, ymbs, xmbs));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
 }
 
-static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
-                                      int mis, TX_SIZE txfm_max,
-                                      int mb_rows_left, int mb_cols_left) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
+                                    TX_SIZE txfm_max,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
 
-  if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
-    MACROBLOCK *const x = &cpi->mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
-    const int segment_id = mbmi->segment_id;
-    const int ymbs = MIN(4, mb_rows_left);
-    const int xmbs = MIN(4, mb_cols_left);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-    xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
-    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  bwl = mi_width_log2(mi->mbmi.sb_type);
+  bhl = mi_height_log2(mi->mbmi.sb_type);
+
+  if (bwl == bsl && bhl == bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
+                           mi_row, mi_col, bsize);
+  } else if (bwl == bsl && bhl < bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
+                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
+                           mi_row + bs, mi_col, bsize);
+  } else if (bwl < bsl && bhl == bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
+                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
+                           mi_row, mi_col + bs, bsize);
+  } else {
+    BLOCK_SIZE_TYPE subsize;
+    int n;
+
+    assert(bwl < bsl && bhl < bsl);
+    if (bsize == BLOCK_SIZE_SB64X64) {
+      subsize = BLOCK_SIZE_SB32X32;
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+    }
+
+    for (n = 0; n < 4; n++) {
+      const int y_idx = n >> 1, x_idx = n & 0x01;
+
+      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+                              txfm_max, mi_row + y_idx * bs,
+                              mi_col + x_idx * bs, subsize);
+    }
   }
 }
 
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   VP9_COMMON *const cm = &cpi->common;
-  int mb_row, mb_col;
+  int mi_row, mi_col;
   const int mis = cm->mode_info_stride;
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+  for (mi_row = 0; mi_row < cm->mi_rows;
+       mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
-      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        reset_skip_txfm_size_sb64(cpi, mi, mis, txfm_max,
-                                  cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-      } else {
-        int i;
-
-        for (i = 0; i < 4; i++) {
-          const int x_idx_sb = (i & 1) << 1, y_idx_sb = i & 2;
-          MODE_INFO *sb_mi = mi + y_idx_sb * mis + x_idx_sb;
-
-          if (mb_row + y_idx_sb >= cm->mb_rows ||
-              mb_col + x_idx_sb >= cm->mb_cols)
-            continue;
-
-          if (sb_mi->mbmi.sb_type) {
-            reset_skip_txfm_size_sb32(cpi, sb_mi, mis, txfm_max,
-                                      cm->mb_rows - mb_row - y_idx_sb,
-                                      cm->mb_cols - mb_col - x_idx_sb);
-          } else {
-            int m;
-
-            for (m = 0; m < 4; m++) {
-              const int x_idx = x_idx_sb + (m & 1), y_idx = y_idx_sb + (m >> 1);
-              MODE_INFO *mb_mi;
-
-              if (mb_col + x_idx >= cm->mb_cols ||
-                  mb_row + y_idx >= cm->mb_rows)
-                continue;
-
-              mb_mi = mi + y_idx * mis + x_idx;
-              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              reset_skip_txfm_size_mb(cpi, mb_mi, txfm_max);
-            }
-          }
-        }
-      }
+    for (mi_col = 0; mi_col < cm->mi_cols;
+         mi_col += 8, mi += 8) {
+      reset_skip_txfm_size_sb(cpi, mi, txfm_max,
+                              mi_row, mi_col, BLOCK_SIZE_SB64X64);
     }
   }
 }
 
 void vp9_encode_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  // In the longer term the encoder should be generalized to match the
+  // decoder such that we allow compound where one of the 3 buffers has a
+  // different sign bias and that buffer is then the fixed ref. However, this
+  // requires further work in the rd loop. For now the only supported encoder
+  // side behaviour is where the ALT ref buffer has opposite sign bias to
+  // the other two.
+  if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+       cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+      (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+       cm->ref_frame_sign_bias[LAST_FRAME])) {
+    cm->allow_comp_inter_inter = 0;
+  } else {
+    cm->allow_comp_inter_inter = 1;
+    cm->comp_fixed_ref = ALTREF_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = GOLDEN_FRAME;
+  }
+
   if (cpi->sf.RD) {
     int i, frame_type, pred_type;
     TXFM_MODE txfm_type;
@@ -1535,7 +1767,7 @@
       frame_type = 2;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3)
+    if (frame_type == 3 || !cm->allow_comp_inter_inter)
       pred_type = SINGLE_PREDICTION_ONLY;
     else if (cpi->rd_prediction_type_threshes[frame_type][1] >
                  cpi->rd_prediction_type_threshes[frame_type][0] &&
@@ -1584,15 +1816,11 @@
     } else
       txfm_type = ALLOW_8X8;
 #else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
                   cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                     ALLOW_32X32 : TX_MODE_SELECT;
 #endif
     cpi->common.txfm_mode = txfm_type;
-    if (txfm_type != TX_MODE_SELECT) {
-      cpi->common.prob_tx[0] = 128;
-      cpi->common.prob_tx[1] = 128;
-    }
     cpi->common.comp_pred_mode = pred_type;
     encode_frame_internal(cpi);
 
@@ -1617,29 +1845,50 @@
       int single_count_zero = 0;
       int comp_count_zero = 0;
 
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-        single_count_zero += cpi->single_pred_count[i];
-        comp_count_zero += cpi->comp_pred_count[i];
+      for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+        single_count_zero += cpi->comp_inter_count[i][0];
+        comp_count_zero += cpi->comp_inter_count[i][1];
       }
 
       if (comp_count_zero == 0) {
         cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+        vp9_zero(cpi->comp_inter_count);
       } else if (single_count_zero == 0) {
         cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+        vp9_zero(cpi->comp_inter_count);
       }
     }
 
     if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
-                           cpi->txfm_count_32x32p[TX_4X4] +
-                           cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
-                              cpi->txfm_count_16x16p[TX_8X8];
-      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
-      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
-      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];
+      int count4x4 = 0;
+      int count8x8_lp = 0, count8x8_8x8p = 0;
+      int count16x16_16x16p = 0, count16x16_lp = 0;
+      int count32x32 = 0;
 
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_32x32p[i][TX_4X4];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_16x16p[i][TX_4X4];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_8x8p[i][TX_4X4];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_lp += cm->fc.tx_count_32x32p[i][TX_8X8];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_lp += cm->fc.tx_count_16x16p[i][TX_8X8];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_8x8p += cm->fc.tx_count_8x8p[i][TX_8X8];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count16x16_16x16p += cm->fc.tx_count_16x16p[i][TX_16X16];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count16x16_lp += cm->fc.tx_count_32x32p[i][TX_16X16];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32];
+
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
           count32x32 == 0) {
         cpi->common.txfm_mode = ALLOW_8X8;
@@ -1665,70 +1914,7 @@
 
 }
 
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
-  int r, c;
-  int i;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-  }
-
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-  }
-
-  for (i = 0; i < 24; i++)
-    x->block[i].coeff = x->coeff + i * 16;
-}
-
 void vp9_build_block_offsets(MACROBLOCK *x) {
-  int block = 0;
-  int br, bc;
-
-  vp9_build_block_doffsets(&x->e_mbd);
-
-  for (br = 0; br < 4; br++) {
-    for (bc = 0; bc < 4; bc++) {
-      BLOCK *this_block = &x->block[block];
-      // this_block->base_src = &x->src.y_buffer;
-      // this_block->src_stride = x->src.y_stride;
-      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      this_block->base_src = &x->src.y_buffer;
-      this_block->src_stride = x->src.y_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // u blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.u_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // v blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.v_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
 }
 
 static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -1736,53 +1922,23 @@
   const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
   const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
 
-#ifdef MODE_STATS
-  const int is_key = cpi->common.frame_type == KEY_FRAME;
-
-  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
-  ++ uv_modes_y[m][uvm];
-
-  if (m == B_PRED) {
-    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
-
-    int b = 0;
-
-    do {
-      ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
-  }
-#endif
-
-  if (xd->mode_info_context->mbmi.sb_type) {
-    ++cpi->sb_ymode_count[m];
+  ++cpi->y_uv_mode_count[m][uvm];
+  if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+    const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+    const int bsl = MIN(bwl, bhl);
+    ++cpi->y_mode_count[MIN(bsl, 3)][m];
   } else {
-    ++cpi->ymode_count[m];
+    int idx, idy;
+    int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
+    int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first;
+        ++cpi->y_mode_count[0][m];
+      }
+    }
   }
-  if (m != I8X8_PRED)
-    ++cpi->y_uv_mode_count[m][uvm];
-  else {
-    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
-  }
-  if (m == B_PRED) {
-    int b = 0;
-    do {
-      int m = xd->block[b].bmi.as_mode.first;
-#if CONFIG_NEWBINTRAMODES
-      if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-      ++cpi->bmode_count[m];
-    } while (++b < 16);
-  }
 }
 
 // Experimental stub function to create a per MB zbin adjustment based on
@@ -1806,268 +1962,22 @@
 #endif
 }
 
-static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
-                                         ENTROPY_CONTEXT_PLANES ta[16],
-                                         ENTROPY_CONTEXT_PLANES tl[16],
-                                         TOKENEXTRA *t[16],
-                                         TOKENEXTRA **tp,
-                                         int skip[16], int output_enabled) {
-  MACROBLOCK *const x = &cpi->mb;
-
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
-    TOKENEXTRA tokens[4][1024+512];
-    int n_tokens[4], n;
-
-    // if there were no skips, we don't need to do anything
-    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
-      return;
-
-    // if we don't do coeff skipping for this frame, we don't
-    // need to do anything here
-    if (!cpi->common.mb_no_coeff_skip)
-      return;
-
-    // if all 4 MBs skipped coeff coding, nothing to be done
-    if (skip[0] && skip[1] && skip[2] && skip[3])
-      return;
-
-    // so the situation now is that we want to skip coeffs
-    // for some MBs, but not all, and we didn't code EOB
-    // coefficients for them. However, the skip flag for this
-    // SB will be 0 overall, so we need to insert EOBs in the
-    // middle of the token tree. Do so here.
-    for (n = 0; n < 4; n++) {
-      if (n < 3) {
-        n_tokens[n] = t[n + 1] - t[n];
-      } else {
-        n_tokens[n] = *tp - t[3];
-      }
-      if (n_tokens[n]) {
-        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
-      }
-    }
-
-    // reset pointer, stuff EOBs where necessary
-    *tp = t[0];
-    for (n = 0; n < 4; n++) {
-      if (skip[n]) {
-        x->e_mbd.above_context = &ta[n * 2];
-        x->e_mbd.left_context  = &tl[n * 2];
-        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
-      } else {
-        if (n_tokens[n]) {
-          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-        }
-        (*tp) += n_tokens[n];
-      }
-    }
-  } else {
-    TOKENEXTRA tokens[16][16 * 25];
-    int n_tokens[16], n;
-
-    // if there were no skips, we don't need to do anything
-    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
-        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
-        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
-        !skip[12] && !skip[13] && !skip[14] && !skip[15])
-      return;
-
-    // if we don't do coeff skipping for this frame, we don't
-    // need to do anything here
-    if (!cpi->common.mb_no_coeff_skip)
-      return;
-
-    // if all 4 MBs skipped coeff coding, nothing to be done
-    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
-        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
-        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
-        skip[12] && skip[13] && skip[14] && skip[15])
-      return;
-
-    // so the situation now is that we want to skip coeffs
-    // for some MBs, but not all, and we didn't code EOB
-    // coefficients for them. However, the skip flag for this
-    // SB will be 0 overall, so we need to insert EOBs in the
-    // middle of the token tree. Do so here.
-    for (n = 0; n < 16; n++) {
-      if (n < 15) {
-        n_tokens[n] = t[n + 1] - t[n];
-      } else {
-        n_tokens[n] = *tp - t[15];
-      }
-      if (n_tokens[n]) {
-        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
-      }
-    }
-
-    // reset pointer, stuff EOBs where necessary
-    *tp = t[0];
-    for (n = 0; n < 16; n++) {
-      if (skip[n]) {
-        x->e_mbd.above_context = &ta[n];
-        x->e_mbd.left_context  = &tl[n];
-        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
-      } else {
-        if (n_tokens[n]) {
-          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-        }
-        (*tp) += n_tokens[n];
-      }
-    }
-  }
-}
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void gather_nzcs_mb16(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i;
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 24; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 16; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV) {
-        for (i = 16; i < 24; ++i) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      } else {
-        for (i = 16; i < 24; i += 4) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      }
-      break;
-
-    case TX_16X16:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 16; i < 24; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void gather_nzcs_sb32(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i, j;
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  vpx_memset(m->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 96; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 96; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 64; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 2; ++i)
-    for (j = 0; j < 2; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-
-static void gather_nzcs_sb64(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i, j;
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 384; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 384; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 384; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      for (i = 0; i < 384; i += 64) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 4; ++i)
-    for (j = 0; j < 4; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-#endif
-
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled,
-                              int mb_row, int mb_col) {
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int output_enabled, int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  int n;
   MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  unsigned int segment_id = mbmi->segment_id;
   const int mis = cm->mode_info_stride;
-  unsigned char ref_pred_flag;
+  const int bwl = mi_width_log2(bsize);
+  const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
 
-  assert(!xd->mode_info_context->mbmi.sb_type);
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
   if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM && output_enabled) {
-      // Adjust the zbin based on this MB rate.
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
       vp9_update_zbin_extra(cpi, x);
     }
@@ -2083,16 +1993,17 @@
     // Increase zbin size to suppress noise
     cpi->zbin_mode_boost = 0;
     if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
+      if (mbmi->ref_frame[0] != INTRA_FRAME) {
         if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
+          if (mbmi->ref_frame[0] != LAST_FRAME)
             cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
           else
             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
+        } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
           cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
+        } else {
           cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+        }
       } else {
         cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
       }
@@ -2099,641 +2010,94 @@
     }
 
     vp9_update_zbin_extra(cpi, x);
-
-    // SET VARIOUS PREDICTION FLAGS
-
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
   }
 
-  if (mbmi->ref_frame == INTRA_FRAME) {
-#if 0  // def ENC_DEBUG
-    if (enc_debug) {
-      printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,
-             mbmi->txfm_size);
-    }
-#endif
-    if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra4x4mby(x);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(x);
-      vp9_encode_intra8x8mbuv(x);
-    } else {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra16x16mby(cm, x);
-    }
-
+  if (mbmi->ref_frame[0] == INTRA_FRAME) {
+    vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
+                                    BLOCK_SIZE_SB8X8 : bsize);
+    vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
+                                     BLOCK_SIZE_SB8X8 : bsize);
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
-    int ref_fb_idx;
-#ifdef ENC_DEBUG
-    if (enc_debug)
-      printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-             mbmi->mode, x->skip, mbmi->txfm_size,
-             mbmi->ref_frame, mbmi->second_ref_frame,
-             mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-             mbmi->interp_filter);
-#endif
+    int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
+    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
+    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+    if (mbmi->ref_frame[1] > 0) {
+      idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
+      second_ref_fb = &cm->yv12_fb[idx];
+    }
 
     assert(cm->frame_type != KEY_FRAME);
 
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
+    setup_pre_planes(xd, ref_fb, second_ref_fb,
+                     mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (mbmi->second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(cm, x, mb_row, mb_col);
-
-      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
-      if (!cpi->common.mb_no_coeff_skip)
-        mbmi->mb_skip_coeff = 0;
-
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_16x16_predictors_mb(xd,
-                                                 xd->dst.y_buffer,
-                                                 xd->dst.u_buffer,
-                                                 xd->dst.v_buffer,
-                                                 xd->dst.y_stride,
-                                                 xd->dst.uv_stride);
-      }
-#endif
-    }
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
+                                  bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
+                                                           : bsize);
   }
 
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i, j;
-      printf("\n");
-      printf("qcoeff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("src_diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", x->src_diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->block[0].diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("final y\n");
-      for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++)
-          printf("%3d ", xd->dst.y_buffer[i * xd->dst.y_stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final u\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->dst.u_buffer[i * xd->dst.uv_stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final v\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->dst.v_buffer[i * xd->dst.uv_stride + j]);
-        printf("\n");
-      }
-      fflush(stdout);
-    }
-#endif
-
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_mb16(cm, xd);
-#endif
-    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
-
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+  } else if (!x->skip) {
+    vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
+    int mb_skip_context =
+        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
 
-    if (cm->mb_no_coeff_skip) {
-      mbmi->mb_skip_coeff = 1;
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_mb_tokens_context(xd);
-    } else {
-      vp9_stuff_mb(cpi, xd, t, !output_enabled);
-      mbmi->mb_skip_coeff = 0;
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) {
-      assert(mbmi->txfm_size <= TX_16X16);
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-      mbmi->txfm_size = TX_16X16;
-    } else if (mbmi->mode != B_PRED &&
-               !(mbmi->mode == SPLITMV &&
-                 mbmi->partitioning == PARTITIONING_4X4) &&
-               cpi->common.txfm_mode >= ALLOW_8X8) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-}
-
-static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  unsigned char ref_pred_flag;
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  unsigned int segment_id = mi->mbmi.segment_id;
-  const int mis = cm->mode_info_stride;
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug) {
-    printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled);
-    printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-           mi->mbmi.mode, x->skip, mi->mbmi.txfm_size,
-           mi->mbmi.ref_frame, mi->mbmi.second_ref_frame,
-           mi->mbmi.mv[0].as_mv.row, mi->mbmi.mv[0].as_mv.col,
-           mi->mbmi.interp_filter);
-  }
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-
-    // SET VARIOUS PREDICTION FLAGS
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                      vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-  }
-
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+    mbmi->mb_skip_coeff = 1;
     if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
+      cm->fc.mbskip_count[mb_skip_context][1]++;
+    vp9_reset_sb_tokens_context(xd,
+                 (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
-  if (!x->skip) {
-    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
-                         dst, dst_y_stride);
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    switch (mi->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sby_32x32(x);
-        vp9_transform_sbuv_16x16(x);
-        vp9_quantize_sby_32x32(x);
-        vp9_quantize_sbuv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sby_32x32(cm, x);
-          vp9_optimize_sbuv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sby_32x32(xd);
-        vp9_inverse_transform_sbuv_16x16(xd);
-        break;
-      case TX_16X16:
-        vp9_transform_sby_16x16(x);
-        vp9_transform_sbuv_16x16(x);
-        vp9_quantize_sby_16x16(x);
-        vp9_quantize_sbuv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sby_16x16(cm, x);
-          vp9_optimize_sbuv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sby_16x16(xd);
-        vp9_inverse_transform_sbuv_16x16(xd);
-        break;
-      case TX_8X8:
-        vp9_transform_sby_8x8(x);
-        vp9_transform_sbuv_8x8(x);
-        vp9_quantize_sby_8x8(x);
-        vp9_quantize_sbuv_8x8(x);
-        if (x->optimize) {
-          vp9_optimize_sby_8x8(cm, x);
-          vp9_optimize_sbuv_8x8(cm, x);
-        }
-        vp9_inverse_transform_sby_8x8(xd);
-        vp9_inverse_transform_sbuv_8x8(xd);
-        break;
-      case TX_4X4:
-        vp9_transform_sby_4x4(x);
-        vp9_transform_sbuv_4x4(x);
-        vp9_quantize_sby_4x4(x);
-        vp9_quantize_sbuv_4x4(x);
-        if (x->optimize) {
-          vp9_optimize_sby_4x4(cm, x);
-          vp9_optimize_sbuv_4x4(cm, x);
-        }
-        vp9_inverse_transform_sby_4x4(xd);
-        vp9_inverse_transform_sbuv_4x4(xd);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sby_s_c(xd, dst);
-    vp9_recon_sbuv_s_c(xd, udst, vdst);
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_sb32(cm, xd);
-#endif
-
-    vp9_tokenize_sb(cpi, xd, t, !output_enabled);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cm->mb_no_coeff_skip ?
-          (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
-
-    mi->mbmi.mb_skip_coeff = 1;
-    if (cm->mb_no_coeff_skip) {
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_sb_tokens_context(xd);
-    } else {
-      vp9_stuff_sb(cpi, xd, t, !output_enabled);
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
   // copy skip flag on all mb_mode_info contexts in this SB
   // if this was a skip at this txfm size
-  if (mb_col < cm->mb_cols - 1)
-    mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  if (mb_row < cm->mb_rows - 1) {
-    mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-    if (mb_col < cm->mb_cols - 1)
-      mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+  for (n = 1; n < bw * bh; n++) {
+    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
+    if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
+      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
   }
 
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-    } else {
-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
-      mi->mbmi.txfm_size = sz;
-      if (mb_col < cm->mb_cols - 1)
-        mi[1].mbmi.txfm_size = sz;
-      if (mb_row < cm->mb_rows - 1) {
-        mi[mis].mbmi.txfm_size = sz;
-        if (mb_col < cm->mb_cols - 1)
-          mi[mis + 1].mbmi.txfm_size = sz;
-      }
-    }
-  }
-}
-
-static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  unsigned char ref_pred_flag;
-  int n;
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  unsigned int segment_id = mi->mbmi.segment_id;
-  const int mis = cm->mode_info_stride;
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        } else {
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-        }
+        mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
+        !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff ||
+          vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+      const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+      if (bsize >= BLOCK_SIZE_SB32X32) {
+        cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++;
+      } else if (bsize >= BLOCK_SIZE_MB16X16) {
+        cm->fc.tx_count_16x16p[context][mbmi->txfm_size]++;
       } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
+        cm->fc.tx_count_8x8p[context][mbmi->txfm_size]++;
       }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                      vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-  }
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
-    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
-
-  if (!x->skip) {
-    vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                            udst, vdst, dst_uv_stride);
-
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sb64y_32x32(x);
-        vp9_transform_sb64uv_32x32(x);
-        vp9_quantize_sb64y_32x32(x);
-        vp9_quantize_sb64uv_32x32(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_32x32(cm, x);
-          vp9_optimize_sb64uv_32x32(cm, x);
-        }
-        vp9_inverse_transform_sb64y_32x32(xd);
-        vp9_inverse_transform_sb64uv_32x32(xd);
-        break;
-      case TX_16X16:
-        vp9_transform_sb64y_16x16(x);
-        vp9_transform_sb64uv_16x16(x);
-        vp9_quantize_sb64y_16x16(x);
-        vp9_quantize_sb64uv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_16x16(cm, x);
-          vp9_optimize_sb64uv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sb64y_16x16(xd);
-        vp9_inverse_transform_sb64uv_16x16(xd);
-        break;
-      case TX_8X8:
-        vp9_transform_sb64y_8x8(x);
-        vp9_transform_sb64uv_8x8(x);
-        vp9_quantize_sb64y_8x8(x);
-        vp9_quantize_sb64uv_8x8(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_8x8(cm, x);
-          vp9_optimize_sb64uv_8x8(cm, x);
-        }
-        vp9_inverse_transform_sb64y_8x8(xd);
-        vp9_inverse_transform_sb64uv_8x8(xd);
-        break;
-      case TX_4X4:
-        vp9_transform_sb64y_4x4(x);
-        vp9_transform_sb64uv_4x4(x);
-        vp9_quantize_sb64y_4x4(x);
-        vp9_quantize_sb64uv_4x4(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_4x4(cm, x);
-          vp9_optimize_sb64uv_4x4(cm, x);
-        }
-        vp9_inverse_transform_sb64y_4x4(xd);
-        vp9_inverse_transform_sb64uv_4x4(xd);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sb64y_s_c(xd, dst);
-    vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_sb64(cm, &x->e_mbd);
-#endif
-    vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
-
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    if (cm->mb_no_coeff_skip) {
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_sb64_tokens_context(xd);
     } else {
-      vp9_stuff_sb64(cpi, xd, t, !output_enabled);
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  // copy skip flag on all mb_mode_info contexts in this SB
-  // if this was a skip at this txfm size
-  for (n = 1; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows)
-      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  }
-
-  if (output_enabled) {
-    if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-    } else {
       int x, y;
       TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
-      for (y = 0; y < 4; y++) {
-        for (x = 0; x < 4; x++) {
-          if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {
+       // The new intra coding scheme requires no change of transform size
+      if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
+        if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
+          sz = TX_16X16;
+        if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
+          sz = TX_8X8;
+        if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8)
+          sz = TX_4X4;
+      } else if (bsize >= BLOCK_SIZE_SB8X8) {
+        sz = mbmi->txfm_size;
+      } else {
+        sz = TX_4X4;
+      }
+
+      for (y = 0; y < bh; y++) {
+        for (x = 0; x < bw; x++) {
+          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
             mi[mis * y + x].mbmi.txfm_size = sz;
           }
         }
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -13,9 +13,12 @@
 #define VP9_ENCODER_VP9_ENCODEFRAME_H_
 
 struct macroblock;
+struct yv12_buffer_config;
 
 void vp9_build_block_offsets(struct macroblock *x);
 
-void vp9_setup_block_ptrs(struct macroblock *x);
+void vp9_setup_src_planes(struct macroblock *x,
+                          const struct yv12_buffer_config *src,
+                          int mb_row, int mb_col);
 
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -19,217 +19,15 @@
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
-
+  mbmi->mode = DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
   if (use_16x16_pred) {
-    mbmi->mode = DC_PRED;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = INTRA_FRAME;
-
-    vp9_encode_intra16x16mby(&cpi->common, x);
+    mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8;
+    vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
   } else {
-    int i;
-
-    for (i = 0; i < 16; i++) {
-      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(x, i);
-    }
+    mbmi->txfm_size = TX_4X4;
+    vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
   }
 
-  return vp9_get_mb_ss(x->src_diff);
-}
-
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-  TX_TYPE tx_type;
-
-#if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
-#endif
-
-  vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
-  vp9_subtract_b(be, b, 16);
-
-  tx_type = get_tx_type_4x4(&x->e_mbd, ib);
-  if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-    vp9_ht_quantize_b_4x4(x, ib, tx_type);
-    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
-  } else {
-    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(x, ib);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                                b->dqcoeff, b->diff, 32);
-  }
-
-  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(mb, i);
-}
-
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_intra_predictors_mby(xd);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  switch (tx_size) {
-    case TX_16X16:
-      vp9_transform_mby_16x16(x);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(cm, x);
-      vp9_inverse_transform_mby_16x16(xd);
-      break;
-    case TX_8X8:
-      vp9_transform_mby_8x8(x);
-      vp9_quantize_mby_8x8(x);
-      if (x->optimize)
-        vp9_optimize_mby_8x8(cm, x);
-      vp9_inverse_transform_mby_8x8(xd);
-      break;
-    default:
-      vp9_transform_mby_4x4(x);
-      vp9_quantize_mby_4x4(x);
-      if (x->optimize)
-        vp9_optimize_mby_4x4(cm, x);
-      vp9_inverse_transform_mby_4x4(xd);
-      break;
-  }
-
-  vp9_recon_mby(xd);
-}
-
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_intra_predictors_mbuv(xd);
-
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    xd->predictor, x->src.uv_stride);
-
-  switch (tx_size) {
-    case TX_4X4:
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize)
-        vp9_optimize_mbuv_4x4(cm, x);
-      vp9_inverse_transform_mbuv_4x4(xd);
-      break;
-    default:  // 16x16 or 8x8
-      vp9_transform_mbuv_8x8(x);
-      vp9_quantize_mbuv_8x8(x);
-      if (x->optimize)
-        vp9_optimize_mbuv_8x8(cm, x);
-      vp9_inverse_transform_mbuv_8x8(xd);
-      break;
-    }
-
-  vp9_recon_intra_mbuv(xd);
-}
-
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCKD *b = &xd->block[ib];
-  BLOCK *be = &x->block[ib];
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-  vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor);
-  // generate residual blocks
-  vp9_subtract_4b_c(be, b, 16);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-    tx_type = get_tx_type_8x8(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
-      x->quantize_b_8x8(x, idx, tx_type);
-      vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
-                            16, tx_type);
-    } else {
-      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x, idx, DCT_DCT);
-      vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      b = &xd->block[ib + iblock[i]];
-      be = &x->block[ib + iblock[i]];
-      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-      if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-        vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
-      } else if (!(i & 1) &&
-                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1],
-                                    (b + 1)->dqcoeff, (b + 1)->diff, 32);
-        i++;
-      } else {
-        x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(x, ib + iblock[i]);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
-      }
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    b = &xd->block[ib + iblock[i]];
-    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                  b->dst_stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++)
-    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
-}
-
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-
-  vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor);
-
-  vp9_subtract_b(be, b, 8);
-
-  x->fwd_txm4x4(be->src_diff, be->coeff, 16);
-  x->quantize_b_4x4(x, ib);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                              b->dqcoeff, b->diff, 16);
-
-  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                   b->dst_stride);
-}
-
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]];
-    int mode = b->bmi.as_mode.first;
-
-    encode_intra_uv4x4(x, i + 16, mode);  // u
-    encode_intra_uv4x4(x, i + 20, mode);  // v
-  }
+  return vp9_get_mb_ss(x->plane[0].src_diff);
 }
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,12 +14,9 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(MACROBLOCK *mb);
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
+void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
+                              BLOCK_SIZE_TYPE bs);
+void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
+                               BLOCK_SIZE_TYPE bs);
 
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,481 +20,55 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride) {
   int r, c;
 
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff_ptr += pitch;
-    pred_ptr += pitch;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
     src_ptr  += src_stride;
   }
 }
 
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-  int r, c;
 
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+  struct macroblock_plane *const p = &x->plane[plane];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
 
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
+  vp9_subtract_block(bh, bw, p->src_diff, bw,
+                     p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 256;
-  int16_t *vdiff = diff + 320;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++)
-      udiff[c] = usrc[c] - upred[c];
-
-    udiff += 8;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 8;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  subtract_plane(x, bsize, 0);
 }
 
-void vp9_subtract_mbuv_c(int16_t *diff, uint8_t *usrc,
-                         uint8_t *vsrc, uint8_t *pred, int stride) {
-  uint8_t *upred = pred + 256;
-  uint8_t *vpred = pred + 320;
-
-  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
-}
-
-void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += 16;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += 32;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 1024;
-  int16_t *vdiff = diff + 1024 + 256;
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      udiff[c] = usrc[c] - upred[c];
-
-    udiff += 16;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      vdiff[c] = vsrc[c] - vpred[c];
-
-    vdiff += 16;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                            const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 64; r++) {
-    for (c = 0; c < 64; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += 64;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
-                             const uint8_t *vsrc, int src_stride,
-                             const uint8_t *upred,
-                             const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 4096;
-  int16_t *vdiff = diff + 4096 + 1024;
-  int r, c;
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
-      udiff[c] = usrc[c] - upred[c];
-    }
-
-    udiff += 32;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 32;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_mby_c(int16_t *diff, uint8_t *src,
-                        uint8_t *pred, int stride) {
-  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
-}
-
-static void subtract_mb(MACROBLOCK *x) {
-  BLOCK *b = &x->block[0];
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
-                   b->src_stride);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-}
-
-void vp9_transform_mby_4x4(MACROBLOCK *x) {
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   int i;
-  MACROBLOCKD *xd = &x->e_mbd;
 
-  for (i = 0; i < 16; i++) {
-    BLOCK *b = &x->block[i];
-    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
-    } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) {
-      x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32);
-      i++;
-    } else {
-      x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32);
-    }
-  }
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    subtract_plane(x, bsize, i);
 }
 
-void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2)
-    x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  vp9_subtract_sby(x, bsize);
+  vp9_subtract_sbuv(x, bsize);
 }
 
-static void transform_mb_4x4(MACROBLOCK *x) {
-  vp9_transform_mby_4x4(x);
-  vp9_transform_mbuv_4x4(x);
-}
 
-void vp9_transform_mby_8x8(MACROBLOCK *x) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_TYPE tx_type;
-
-  for (i = 0; i < 9; i += 8) {
-    BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32);
-    }
-  }
-  for (i = 2; i < 11; i += 8) {
-    BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32);
-    }
-  }
-}
-
-void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 4)
-    x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16);
-}
-
-void vp9_transform_mb_8x8(MACROBLOCK *x) {
-  vp9_transform_mby_8x8(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_mby_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-  vp9_clear_system_state();
-  if (tx_type != DCT_DCT) {
-    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
-  } else {
-    x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32);
-  }
-}
-
-void vp9_transform_mb_16x16(MACROBLOCK *x) {
-  vp9_transform_mby_16x16(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_sby_32x32(MACROBLOCK *x) {
-  vp9_short_fdct32x32(x->src_diff, x->coeff, 64);
-}
-
-void vp9_transform_sby_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
-                         x->coeff + n * 256, 32, tx_type);
-    } else {
-      x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
-                      x->coeff + n * 256, 64);
-    }
-  }
-}
-
-void vp9_transform_sby_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
-                       x->coeff + n * 64, 32, tx_type);
-    } else {
-      x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
-                    x->coeff + n * 64, 64);
-    }
-  }
-}
-
-void vp9_transform_sby_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
-                       x->coeff + n * 16, 32, tx_type);
-    } else {
-      x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
-                    x->coeff + n * 16, 64);
-    }
-  }
-}
-
-void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  x->fwd_txm16x16(x->src_diff + 1024, x->coeff + 1024, 32);
-  x->fwd_txm16x16(x->src_diff + 1280, x->coeff + 1280, 32);
-}
-
-void vp9_transform_sbuv_8x8(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->fwd_txm8x8(x->src_diff + 1024 + y_idx * 16 * 8 + x_idx * 8,
-                  x->coeff + 1024 + n * 64, 32);
-    x->fwd_txm8x8(x->src_diff + 1280 + y_idx * 16 * 8 + x_idx * 8,
-                  x->coeff + 1280 + n * 64, 32);
-  }
-}
-
-void vp9_transform_sbuv_4x4(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    x->fwd_txm4x4(x->src_diff + 1024 + y_idx * 16 * 4 + x_idx * 4,
-                  x->coeff + 1024 + n * 16, 32);
-    x->fwd_txm4x4(x->src_diff + 1280 + y_idx * 16 * 4 + x_idx * 4,
-                  x->coeff + 1280 + n * 16, 32);
-  }
-}
-
-void vp9_transform_sb64y_32x32(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_short_fdct32x32(x->src_diff + y_idx * 64 * 32 + x_idx * 32,
-                        x->coeff + n * 1024, 128);
-  }
-}
-
-void vp9_transform_sb64y_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
-                         x->coeff + n * 256, 64, tx_type);
-    } else {
-      x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
-                      x->coeff + n * 256, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64y_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
-                         x->coeff + n * 64, 64, tx_type);
-    } else {
-      x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
-                    x->coeff + n * 64, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64y_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
-                       x->coeff + n * 16, 64, tx_type);
-    } else {
-      x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
-                    x->coeff + n * 16, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64uv_32x32(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  vp9_short_fdct32x32(x->src_diff + 4096,
-                      x->coeff + 4096, 64);
-  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,
-                      x->coeff + 4096 + 1024, 64);
-}
-
-void vp9_transform_sb64uv_16x16(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->fwd_txm16x16(x->src_diff + 4096 + y_idx * 32 * 16 + x_idx * 16,
-                    x->coeff + 4096 + n * 256, 64);
-    x->fwd_txm16x16(x->src_diff + 4096 + 1024 + y_idx * 32 * 16 + x_idx * 16,
-                    x->coeff + 4096 + 1024 + n * 256, 64);
-  }
-}
-
-void vp9_transform_sb64uv_8x8(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    x->fwd_txm8x8(x->src_diff + 4096 + y_idx * 32 * 8 + x_idx * 8,
-                  x->coeff + 4096 + n * 64, 64);
-    x->fwd_txm8x8(x->src_diff + 4096 + 1024 + y_idx * 32 * 8 + x_idx * 8,
-                  x->coeff + 4096 + 1024 + n * 64, 64);
-  }
-}
-
-void vp9_transform_sb64uv_4x4(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-
-    x->fwd_txm4x4(x->src_diff + 4096 + y_idx * 32 * 4 + x_idx * 4,
-                  x->coeff + 4096 + n * 16, 64);
-    x->fwd_txm4x4(x->src_diff + 4096 + 1024 + y_idx * 32 * 4 + x_idx * 4,
-                  x->coeff + 4096 + 1024 + n * 16, 64);
-  }
-}
-
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -533,126 +107,84 @@
                                      int idx, int token,
                                      uint8_t *token_cache,
                                      int pad, int l) {
-  int bak = token_cache[idx], pt;
-  token_cache[idx] = token;
+  int bak = token_cache[scan[idx]], pt;
+  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
-  token_cache[idx] = bak;
+  token_cache[scan[idx]] = bak;
   return pt;
 }
 
-static void optimize_b(VP9_COMMON *const cm,
-                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
-                       const int16_t *dequant_ptr,
+static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
+                       int plane, int block, BLOCK_SIZE_TYPE bsize,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
-  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+                       TX_SIZE tx_size) {
+  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
   MACROBLOCKD *const xd = &mb->e_mbd;
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const int16_t *coeff_ptr = mb->coeff + ib * 16;
-  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;
-  int eob = xd->eobs[ib], final_eob, sz = 0;
+  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff,
+                                          block, 16);
+  int16_t *qcoeff_ptr;
+  int16_t *dqcoeff_ptr;
+  int eob = xd->plane[plane].eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
+  PLANE_TYPE type = xd->plane[plane].plane_type;
   int err_mult = plane_rd_mult[type];
   int default_eob, pad;
   int const *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  // TODO(debargha): the dynamic programming approach used in this function
-  // is not compatible with the true rate cost when nzcs are used. Note
-  // the total rate is the sum of the nzc rate and the indicvidual token
-  // rates. The latter part can be optimized in this function, but because
-  // the nzc rate is a function of all the other tokens without a Markov
-  // relationship this rate cannot be considered correctly.
-  // The current implementation uses a suboptimal approach to account for
-  // the nzc rates somewhat, but in reality the optimization approach needs
-  // to change substantially.
-  uint16_t nzc = xd->nzcs[ib];
-  uint16_t nzc0, nzc1;
-  uint16_t final_nzc = 0, final_nzc_exp;
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
-  nzc0 = nzc1 = nzc;
-#endif
+  const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
+                                             block, 2 * tx_size);
+  const int16_t *dequant_ptr = xd->plane[plane].dequant;
+  const uint8_t * band_translate;
 
+  assert((!type && !plane) || (type && plane));
+  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
+  qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#endif
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
       default_eob = 64;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       default_eob = 256;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
+      scan = vp9_default_scan_32x32;
       default_eob = 1024;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
+  assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
     rdmult = (rdmult * 9) >> 4;
   rddiv = mb->rddiv;
   memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
-#if CONFIG_CODE_NONZEROCOUNT
-  tokens[eob][0].rate = nzc_cost[nzc];
-#else
   tokens[eob][0].rate = 0;
-#endif
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
   tokens[eob][0].token = DCT_EOB_TOKEN;
@@ -660,14 +192,12 @@
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[i] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].Token;
+    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
+        qcoeff_ptr[scan[i]]].token];
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
-#if CONFIG_CODE_NONZEROCOUNT
-    int new_nzc0, new_nzc1;
-#endif
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -679,16 +209,18 @@
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
         rate0 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token];
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][0].token];
         rate1 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token];
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][1].token];
       }
       UPDATE_RD_COST();
       /* And pick the best. */
@@ -702,9 +234,6 @@
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
       best_index[i][0] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc0 = (best ? nzc1 : nzc0);
-#endif
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
@@ -731,28 +260,30 @@
              DCT_EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
              DCT_EOB_TOKEN : ZERO_TOKEN;
-#if CONFIG_CODE_NONZEROCOUNT
-        // Account for rate drop because of the nzc change.
-        // TODO(debargha): Find a better solution
-        rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];
-        rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];
-#endif
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
-          rate0 += mb->token_costs[tx_size][type][ref][band][pt][
-              tokens[next][0].token];
+          if (!x)
+            rate0 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
+          else
+            rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
                                          pad, default_eob);
-          rate1 += mb->token_costs[tx_size][type][ref][band][pt][
-              tokens[next][1].token];
+          if (!x)
+            rate1 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
+          else
+            rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
         }
       }
 
@@ -771,11 +302,6 @@
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
       best_index[i][1] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc1 = (best ? nzc1 : nzc0) - (!x);
-      nzc0 = new_nzc0;
-      nzc1 = new_nzc1;
-#endif
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -783,7 +309,7 @@
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = get_coef_band(scan, tx_size, i + 1);
+      band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -802,8 +328,8 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(scan, tx_size, i + 1);
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  band = get_coef_band(band_translate, i + 1);
+  pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
   error0 = tokens[next][0].error;
@@ -810,21 +336,17 @@
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
+  rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
+  rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
-#if CONFIG_CODE_NONZEROCOUNT
-  final_nzc_exp = (best ? nzc1 : nzc0);
-#endif
   final_eob = i0 - 1;
+  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
     if (x) {
       final_eob = i;
-#if CONFIG_CODE_NONZEROCOUNT
-      ++final_nzc;
-#endif
     }
     rc = scan[i];
     qcoeff_ptr[rc] = x;
@@ -835,519 +357,338 @@
   }
   final_eob++;
 
-  xd->eobs[ib] = final_eob;
+  xd->plane[plane].eobs[block] = final_eob;
   *a = *l = (final_eob > 0);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(final_nzc == final_nzc_exp);
-  xd->nzcs[ib] = final_nzc;
-#endif
 }
 
-void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+struct optimize_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
+                    struct optimize_ctx *ctx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int x, y;
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  // find current entropy context
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 0; b < 16; b++) {
-    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
-               ta + vp9_block2above[TX_4X4][b],
-               tl + vp9_block2left[TX_4X4][b], TX_4X4);
-  }
+  optimize_b(cm, mb, plane, block, bsize,
+             &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2);
 }
 
-void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b++) {
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
-               ta + vp9_block2above[TX_4X4][b],
-               tl + vp9_block2left[TX_4X4][b], TX_4X4);
-  }
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  const struct optimize_block_args* const args = arg;
+  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
+                 args->ctx);
 }
 
-static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_4x4(cm, x);
-  vp9_optimize_mbuv_4x4(cm, x);
-}
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx) {
+  int p;
 
-void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    const struct macroblockd_plane* const plane = &xd->plane[p];
+    const int bwl = b_width_log2(bsize) - plane->subsampling_x;
+    const int bhl = b_height_log2(bsize) - plane->subsampling_y;
+    const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+    const TX_SIZE tx_size = p ? get_uv_tx_size(mbmi)
+                              : mbmi->txfm_size;
+    int i, j;
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  for (b = 0; b < 16; b += 4) {
-    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
-    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[1] = a[0] = above_ec;
-    l[1] = l[0] = left_ec;
+    for (i = 0; i < 1 << bwl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->ta[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->ta[p][i] |= plane->above_context[i + j];
+      }
+    }
+    for (i = 0; i < 1 << bhl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->tl[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->tl[p][i] |= plane->left_context[i + j];
+      }
+    }
   }
 }
 
-void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
-  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
-
-  if (!ta || !tl)
-    return;
-
-  for (b = 16; b < 24; b += 4) {
-    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
-    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
-               &above_ec, &left_ec, TX_8X8);
-  }
+void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg);
 }
 
-static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_8x8(cm, x);
-  vp9_optimize_mbuv_8x8(cm, x);
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
 }
 
-void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;
-  ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;
-  ENTROPY_CONTEXT ta, tl;
+struct encode_b_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (!t_above || !t_left)
-    return;
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = plane_block_width(bsize, &xd->plane[plane]);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16);
+  int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      x->plane[plane].src_diff);
+  TX_TYPE tx_type = DCT_DCT;
 
-  ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
-  tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-             &ta, &tl, TX_16X16);
-}
-
-static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_16x16(cm, x);
-  vp9_optimize_mbuv_8x8(cm, x);
-}
-
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta, tl;
-
-  ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-             &ta, &tl, TX_32X32);
-}
-
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
-
-  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;
-  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm16x16(src_diff, coeff, bw * 2);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm8x8(src_diff, coeff, bw * 2);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht4x4(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm4x4(src_diff, coeff, bw * 2);
+      break;
+    default:
+      assert(0);
   }
-}
 
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
-
-  ta[0] = (a[0] + a[1]) != 0;
-  ta[1] = (a[2] + a[3]) != 0;
-  ta[2] = (a1[0] + a1[1]) != 0;
-  ta[3] = (a1[2] + a1[3]) != 0;
-  tl[0] = (l[0] + l[1]) != 0;
-  tl[1] = (l[2] + l[3]) != 0;
-  tl[2] = (l1[0] + l1[1]) != 0;
-  tl[3] = (l1[2] + l1[3]) != 0;
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8);
-  }
+  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
 }
 
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 pd->dst.buf, pd->dst.stride);
+  TX_TYPE tx_type = DCT_DCT;
 
-  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4);
-  }
-}
+  if (x->optimize)
+    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
 
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
-  int b;
-
-  for (b = 64; b < 96; b += 16) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_16X16][b];
-    l = tl + vp9_block2left_sb[TX_16X16][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_16X16);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
+                                        dst, pd->dst.stride);
+      else
+        vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
   }
 }
 
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;
-  int b;
+void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 64; b < 96; b += 4) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_8X8][b];
-    l = tl + vp9_block2left_sb[TX_8X8][b];
-    above_ec = (a[0] + a[1]) != 0;
-    left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[0] = a[1] = above_ec;
-    l[0] = l[1] = left_ec;
-  }
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }
 
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l;
-  int b;
+void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 64; b < 96; b++) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_4X4][b];
-    l = tl + vp9_block2left_sb[TX_4X4][b];
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               a, l, TX_4X4);
-  }
+  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
 
-void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
+void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  ta[1] = (a2[0] + a2[1] + a2[2] + a2[3] + a3[0] + a3[1] + a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  tl[1] = (l2[0] + l2[1] + l2[2] + l2[3] + l3[0] + l3[1] + l3[2] + l3[3]) != 0;
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
+  vp9_subtract_sby(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_32X32);
-  }
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
+void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  ta[2] = (a2[0] + a2[1] + a2[2] + a2[3]) != 0;
-  ta[3] = (a3[0] + a3[1] + a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;
-  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  tl[2] = (l2[0] + l2[1] + l2[2] + l2[3]) != 0;
-  tl[3] = (l3[0] + l3[1] + l3[2] + l3[3]) != 0;
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
+  vp9_subtract_sbuv(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16);
-  }
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
+void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1]) != 0;
-  ta[1] = (a[2] + a[3]) != 0;
-  ta[2] = (a1[0] + a1[1]) != 0;
-  ta[3] = (a1[2] + a1[3]) != 0;
-  ta[4] = (a2[0] + a2[1]) != 0;
-  ta[5] = (a2[2] + a2[3]) != 0;
-  ta[6] = (a3[0] + a3[1]) != 0;
-  ta[7] = (a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1]) != 0;
-  tl[1] = (l[2] + l[3]) != 0;
-  tl[2] = (l1[0] + l1[1]) != 0;
-  tl[3] = (l1[2] + l1[3]) != 0;
-  tl[4] = (l2[0] + l2[1]) != 0;
-  tl[5] = (l2[2] + l2[3]) != 0;
-  tl[6] = (l3[0] + l3[1]) != 0;
-  tl[7] = (l3[2] + l3[3]) != 0;
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
+  vp9_subtract_sb(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8);
-  }
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT ta[16], tl[16];
-  int n;
+static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                               int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  const int bw = plane_block_width(bsize, pd);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
 
-  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 8, x->e_mbd.above_context + 2, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 12, x->e_mbd.above_context + 3, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 8, x->e_mbd.left_context + 2, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 12, x->e_mbd.left_context + 3, 4 * sizeof(ENTROPY_CONTEXT));
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
+  uint8_t *const src = raster_block_offset_uint8(xd, bsize, plane, raster_block,
+                                                 p->src.buf, p->src.stride);
+  uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block,
+                                                 pd->dst.buf, pd->dst.stride);
+  int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      p->src_diff);
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4);
-  }
-}
+  const int txfm_b_size = 4 << tx_size;
+  int ib = raster_block;
+  int tx_ib = ib >> tx_size;
+  int plane_b_size;
 
-void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
-  int b;
+  TX_TYPE tx_type;
+  int mode, b_mode;
 
-  for (b = 256; b < 384; b += 64) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_32X32][b];
-    l = tl + vp9_block2left_sb64[TX_32X32][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-    l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &a_ec, &l_ec, TX_32X32);
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
   }
-}
 
-void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
-  int b;
+  mode = plane == 0? mbmi->mode: mbmi->uv_mode;
+  if (plane == 0 &&
+      mbmi->sb_type < BLOCK_SIZE_SB8X8 &&
+      mbmi->ref_frame[0] == INTRA_FRAME)
+    b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
+  else
+    b_mode = mode;
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b += 16) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_16X16][b];
-    l = tl + vp9_block2left_sb64[TX_16X16][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_16X16);
-    a[0] = a[1] = a1[0] = a1[1] = above_ec;
-    l[0] = l[1] = l1[0] = l1[1] = left_ec;
-  }
-}
+  assert(b_mode >= DC_PRED && b_mode <= TM_PRED);
 
-void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;
-  int b;
+  plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
+  vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride);
+  vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
+                     src, p->src.stride, dst, pd->dst.stride);
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b += 4) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_8X8][b];
-    l = tl + vp9_block2left_sb64[TX_8X8][b];
-    above_ec = (a[0] + a[1]) != 0;
-    left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[0] = a[1] = above_ec;
-    l[0] = l[1] = left_ec;
-  }
-}
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
 
-void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l;
-  int b;
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b++) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_4X4][b];
-    l = tl + vp9_block2left_sb64[TX_4X4][b];
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               a, l, TX_4X4);
-  }
-}
+  // if (x->optimize)
+  // vp9_optimize_b(plane, block, bsize, ss_txfrm_size,
+  //                args->cm, x, args->ctx);
 
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  if (tx_size == TX_16X16) {
-    vp9_transform_mb_16x16(x);
-    vp9_quantize_mb_16x16(x);
-    if (x->optimize)
-      optimize_mb_16x16(cm, x);
-    vp9_inverse_transform_mb_16x16(xd);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_mby_8x8(x);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mby_8x8(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize) {
-        vp9_optimize_mby_8x8(cm, x);
-        vp9_optimize_mbuv_4x4(cm, x);
-      }
-      vp9_inverse_transform_mby_8x8(xd);
-      vp9_inverse_transform_mbuv_4x4(xd);
-    } else {
-      vp9_transform_mb_8x8(x);
-      vp9_quantize_mb_8x8(x);
-      if (x->optimize)
-        optimize_mb_8x8(cm, x);
-      vp9_inverse_transform_mb_8x8(xd);
-    }
-  } else {
-    transform_mb_4x4(x);
-    vp9_quantize_mb_4x4(x);
-    if (x->optimize)
-      optimize_mb_4x4(cm, x);
-    vp9_inverse_transform_mb_4x4(xd);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+        vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
+                                        dst, pd->dst.stride);
+      else
+        vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
   }
 }
 
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+void vp9_encode_intra_block_y(VP9_COMMON *cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
-  subtract_mb(x);
-  vp9_fidct_mb(cm, x);
-  vp9_recon_mb(xd);
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block_intra, &arg);
 }
-
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-
-  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  vp9_transform_mby_4x4(x);
-  vp9_quantize_mby_4x4(x);
-  vp9_inverse_transform_mby_4x4(xd);
-
-  vp9_recon_mby(xd);
+void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
+  foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg);
 }
+
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,82 +22,32 @@
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
 
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col);
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx);
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x,
+                    struct optimize_ctx *ctx);
+void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_transform_mbuv_4x4(MACROBLOCK *x);
-void vp9_transform_mby_4x4(MACROBLOCK *x);
+void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
+void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_transform_mb_8x8(MACROBLOCK *mb);
-void vp9_transform_mby_8x8(MACROBLOCK *x);
-void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_mb_16x16(MACROBLOCK *mb);
-void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_sby_32x32(MACROBLOCK *x);
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_16x16(MACROBLOCK *x);
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_8x8(MACROBLOCK *x);
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_4x4(MACROBLOCK *x);
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_16x16(MACROBLOCK *x);
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_4x4(MACROBLOCK *x);
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_sb64y_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src,
-                          int src_stride, const uint8_t *pred,
-                          int dst_stride);
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride);
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride);
-void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                            const uint8_t *pred, int dst_stride);
-void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
-                             const uint8_t *vsrc, int src_stride,
-                             const uint8_t *upred,
-                             const uint8_t *vpred, int dst_stride);
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -24,68 +24,48 @@
 nmv_context_counts tnmvcounts;
 #endif
 
-static void encode_nmv_component(vp9_writer* const bc,
-                                 int v,
-                                 int r,
-                                 const nmv_component* const mvcomp) {
-  int s, z, c, o, d;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  vp9_write(bc, s, mvcomp->sign);
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+static void encode_mv_component(vp9_writer* w, int comp,
+                                const nmv_component* mvcomp, int usehp) {
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = vp9_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;                // int mv data
+  const int fr = (offset >> 1) & 3;         // fractional mv data
+  const int hp = offset & 1;                // high precision mv data
 
-  c = vp9_get_mv_class(z, &o);
+  assert(comp != 0);
 
-  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
-              vp9_mv_class_encodings + c);
+  // Sign
+  vp9_write(w, sign, mvcomp->sign);
 
-  d = (o >> 3);               /* int mv data */
+  // Class
+  write_token(w, vp9_mv_class_tree, mvcomp->classes,
+              &vp9_mv_class_encodings[mv_class]);
 
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
-                vp9_mv_class0_encodings + d);
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    write_token(w, vp9_mv_class0_tree, mvcomp->class0,
+                &vp9_mv_class0_encodings[d]);
   } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < n; ++i)
+      vp9_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
-}
 
-static void encode_nmv_component_fp(vp9_writer *bc,
-                                    int v,
-                                    int r,
-                                    const nmv_component* const mvcomp,
-                                    int usehp) {
-  int s, z, c, o, d, f, e;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+  // Fractional bits
+  write_token(w, vp9_mv_fp_tree,
+              mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
+              &vp9_mv_fp_encodings[fr]);
 
-  c = vp9_get_mv_class(z, &o);
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
-                vp9_mv_fp_encodings + f);
-  } else {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
-                vp9_mv_fp_encodings + f);
-  }
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      vp9_write(bc, e, mvcomp->class0_hp);
-    } else {
-      vp9_write(bc, e, mvcomp->hp);
-    }
-  }
+  // High precision bit
+  if (usehp)
+    vp9_write(w, hp,
+              mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
+
 static void build_nmv_component_cost_table(int *mvcost,
                                            const nmv_component* const mvcomp,
                                            int usehp) {
@@ -556,30 +536,19 @@
   }
 }
 
-void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
-              vp9_mv_joint_encodings + j);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->row, ref->col, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
-  }
-}
-
-void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp) {
+  const MV diff = {mv->row - ref->row,
+                   mv->col - ref->col};
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
   usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
-  }
+
+  write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+  if (mv_joint_vertical(j))
+    encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
 }
 
 void vp9_build_nmv_cost_table(int *mvjoint,
@@ -600,62 +569,42 @@
                          int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   MV mv;
+  int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl;
+  int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl;
+  int idx, idy;
 
-  if (mbmi->mode == SPLITMV) {
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
     int i;
-
-    for (i = 0; i < x->partition_info->count; i++) {
-      if (x->partition_info->bmi[i].mode == NEW4X4) {
-        if (x->e_mbd.allow_high_precision_mv) {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 1);
+    PARTITION_INFO *pi = x->partition_info;
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        i = idy * 2 + idx;
+        if (pi->bmi[i].mode == NEWMV) {
+          mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
+          mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+                            x->e_mbd.allow_high_precision_mv);
+          if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
+            mv.row = pi->bmi[i].second_mv.as_mv.row -
+                         second_best_ref_mv->as_mv.row;
+            mv.col = pi->bmi[i].second_mv.as_mv.col -
+                         second_best_ref_mv->as_mv.col;
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                              x->e_mbd.allow_high_precision_mv);
           }
-        } else {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 0);
-          }
         }
       }
     }
   } else if (mbmi->mode == NEWMV) {
-    if (x->e_mbd.allow_high_precision_mv) {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      if (mbmi->second_ref_frame > 0) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      }
-    } else {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      if (mbmi->second_ref_frame > 0) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      }
+    mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+    mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+    vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+                      x->e_mbd.allow_high_precision_mv);
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                        x->e_mbd.allow_high_precision_mv);
     }
   }
 }
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -15,11 +15,10 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx);
-void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp);
+
+void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp);
+
 void vp9_build_nmv_cost_table(int *mvjoint,
                               int *mvcost[2],
                               const nmv_context* const mvctx,
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vpx_scale/vpx_scale.h"
@@ -23,7 +22,7 @@
 #include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
+#include "vpx_scale/yv12config.h"
 #include <stdio.h>
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -32,6 +31,8 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "./vpx_scale_rtcd.h"
+// TODO(jkoleszar): for setup_dst_planes
+#include "vp9/common/vp9_reconinter.h"
 
 #define OUTPUT_FPF 0
 
@@ -38,7 +39,7 @@
 #define IIFACTOR   12.5
 #define IIKFACTOR1 12.5
 #define IIKFACTOR2 15.0
-#define RMAX       128.0
+#define RMAX       512.0
 #define GF_RMAX    96.0
 #define ERR_DIVISOR   150.0
 #define MIN_DECAY_FACTOR 0.1
@@ -46,11 +47,17 @@
 #define KF_MB_INTRA_MIN 150
 #define GF_MB_INTRA_MIN 100
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
 
+static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
+  YV12_BUFFER_CONFIG temp = *a;
+  *a = *b;
+  *b = temp;
+}
+
 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
 static int select_cq_level(int qindex) {
@@ -71,8 +78,8 @@
 
 
 // Resets the first pass file to the given position using a relative seek from the current position
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+  cpi->twopass.stats_in = position;
 }
 
 static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -128,7 +135,7 @@
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
             "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
             "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
             stats->frame,
@@ -245,17 +252,11 @@
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
+  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const double av_err = stats->ssim_weighted_pred_err / stats->count;
+  const double this_err = this_frame->ssim_weighted_pred_err;
+  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+                      this_err > av_err ? POW1 : POW2);
 }
 
 static const double weight_table[256] = {
@@ -317,46 +318,69 @@
 }
 
 
-// This function returns the current per frame maximum bitrate target
+// This function returns the current per frame maximum bitrate target.
 static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
-  int max_bits;
+  // Max allocation for a single frame based on the max section guidelines
+  // passed in and how many bits are left.
+  // For VBR base this on the bits and frames left plus the
+  // two_pass_vbrmax_section rate passed in by the user.
+  const double max_bits = (1.0 * cpi->twopass.bits_left /
+      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
 
-  // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
-  max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
-
-  // Trap case where we are out of bits
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
+  // Trap case where we are out of bits.
+  return MAX((int)max_bits, 0);
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
 
-  uint8_t *src_ptr = (*(b->base_src) + b->src);
-  int src_stride = b->src_stride;
-  uint8_t *ref_ptr;
-  int ref_stride = d->pre_stride;
-
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
-  ref_ptr = (uint8_t *)(*(d->base_pre) + d->pre);
+  switch (xd->mode_info_context->mbmi.sb_type) {
+    case BLOCK_SIZE_SB8X8:
+      vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
+                 xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                 (unsigned int *)(best_motion_err));
+      break;
+    case BLOCK_SIZE_SB16X8:
+      vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride,
+                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                  (unsigned int *)(best_motion_err));
+      break;
+    case BLOCK_SIZE_SB8X16:
+      vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                  (unsigned int *)(best_motion_err));
+      break;
+    default:
+      vp9_mse16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                   (unsigned int *)(best_motion_err));
+      break;
+  }
+}
 
-  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
-               (unsigned int *)(best_motion_err));
+static enum BlockSize get_bs(BLOCK_SIZE_TYPE b) {
+  switch (b) {
+    case BLOCK_SIZE_SB8X8:
+      return BLOCK_8X8;
+    case BLOCK_SIZE_SB16X8:
+      return BLOCK_16X8;
+    case BLOCK_SIZE_SB8X16:
+      return BLOCK_8X16;
+    default:
+      return BLOCK_16X16;
+  }
 }
 
 static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -364,8 +388,6 @@
                                      YV12_BUFFER_CONFIG *recon_buffer,
                                      int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
   int num00;
 
   int_mv tmp_mv;
@@ -375,7 +397,8 @@
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   int n;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  vp9_variance_fn_ptr_t v_fn_ptr =
+      cpi->fn_ptr[get_bs(xd->mode_info_context->mbmi.sb_type)];
   int new_mv_mode_penalty = 256;
 
   int sr = 0;
@@ -392,16 +415,29 @@
   further_steps -= sr;
 
   // override the default variance function to use MSE
-  v_fn_ptr.vf = vp9_mse16x16;
+  switch (xd->mode_info_context->mbmi.sb_type) {
+    case BLOCK_SIZE_SB8X8:
+      v_fn_ptr.vf = vp9_mse8x8;
+      break;
+    case BLOCK_SIZE_SB16X8:
+      v_fn_ptr.vf = vp9_mse16x8;
+      break;
+    case BLOCK_SIZE_SB8X16:
+      v_fn_ptr.vf = vp9_mse8x16;
+      break;
+    default:
+      v_fn_ptr.vf = vp9_mse16x16;
+      break;
+  }
 
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
   // Initial step/diamond search centred on best mv
   tmp_mv.as_int = 0;
   ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
   ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
-  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+  tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr,
                                     x->nmvjointcost,
                                     x->mvcost, ref_mv);
@@ -424,7 +460,7 @@
     if (num00)
       num00--;
     else {
-      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+      tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
                                         &num00, &v_fn_ptr,
                                         x->nmvjointcost,
@@ -448,13 +484,13 @@
   MACROBLOCKD *const xd = &x->e_mbd;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
+  const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
+  const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
+  YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
+  YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+  const int recon_y_stride = lst_yv12->y_stride;
+  const int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
@@ -477,9 +513,9 @@
 
   vp9_clear_system_state();  // __asm emms;
 
-  x->src = * cpi->Source;
-  xd->pre = *lst_yv12;
-  xd->dst = *new_yv12;
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL);
+  setup_dst_planes(xd, new_yv12, 0, 0);
 
   x->partition_info = x->pi;
 
@@ -487,12 +523,8 @@
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  vp9_setup_block_ptrs(x);
-
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
 
   // Initialise the MV cost table to the defaults
@@ -500,7 +532,7 @@
   // if ( 0 )
   {
     vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
   }
 
   // for each macroblock row in image
@@ -515,11 +547,10 @@
     recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
     // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 16));
+    x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8));
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
-                    + (VP9BORDERINPIXELS - 16);
+                    + (VP9BORDERINPIXELS - 8);
 
-
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       int this_error;
@@ -526,11 +557,31 @@
       int gf_motion_error = INT_MAX;
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
 
+      if (mb_col * 2 + 1 < cm->mi_cols) {
+        if (mb_row * 2 + 1 < cm->mi_rows) {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
+        } else {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB16X8;
+        }
+      } else {
+        if (mb_row * 2 + 1 < cm->mi_rows) {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X16;
+        } else {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X8;
+        }
+      }
+      xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
+      set_mi_row_col(cm, xd,
+                     mb_row << 1,
+                     1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type),
+                     mb_col << 1,
+                     1 << mi_width_log2(xd->mode_info_context->mbmi.sb_type));
+
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(cpi, x, use_dc_pred);
 
@@ -544,9 +595,9 @@
       intra_error += (int64_t)this_error;
 
       // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-      x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 16));
+      x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8));
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
-                      + (VP9BORDERINPIXELS - 16);
+                      + (VP9BORDERINPIXELS - 8);
 
       // Other than for the first frame do a motion search
       if (cm->current_video_frame > 0) {
@@ -592,9 +643,9 @@
           }
 
           // Reset to last frame as reference buffer
-          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+          xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
 
           // In accumulating a score for the older reference frame
           // take the best of the motion predicted score and
@@ -626,7 +677,12 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x, mb_row, mb_col);
+          xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
+          xd->mode_info_context->mbmi.ref_frame[1] = NONE;
+          vp9_build_inter_predictors_sby(xd, mb_row << 1,
+                                         mb_col << 1,
+                                         xd->mode_info_context->mbmi.sb_type);
+          vp9_encode_sby(cm, x, xd->mode_info_context->mbmi.sb_type);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -679,9 +735,9 @@
       coded_error += (int64_t)this_error;
 
       // adjust to the next column of macroblocks
-      x->src.y_buffer += 16;
-      x->src.u_buffer += 8;
-      x->src.v_buffer += 8;
+      x->plane[0].src.buf += 16;
+      x->plane[1].src.buf += 8;
+      x->plane[2].src.buf += 8;
 
       recon_yoffset += 16;
       recon_uvoffset += 8;
@@ -688,13 +744,10 @@
     }
 
     // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+    x->plane[1].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
+    x->plane[2].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
 
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
     vp9_clear_system_state();  // __asm emms;
   }
 
@@ -746,16 +799,14 @@
     }
 
     // TODO:  handle the case when duration is set to 0, or something less
-    // than the full time between subsequent cpi->source_time_stamp s  .
+    // than the full time between subsequent values of cpi->source_time_stamp.
     fps.duration = (double)(cpi->source->ts_end
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
-           &fps,
-           sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    cpi->twopass.this_frame_stats = fps;
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
@@ -762,9 +813,9 @@
   // the prediction is good enough... but also dont allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -772,15 +823,14 @@
     cpi->twopass.sr_update_lag++;
 
   // swap frame pointers so last frame refers to the frame we just compressed
-  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
-  vp8_yv12_extend_frame_borders(lst_yv12);
+  swap_yv12(lst_yv12, new_yv12);
 
+  vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
+
   // Special case for the first frame. Copy into the GF buffer as a second reference.
-  if (cm->current_video_frame == 0) {
+  if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
-  }
 
-
   // use this to see what the first pass reconstruction looks like
   if (0) {
     char filename[512];
@@ -849,38 +899,28 @@
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
+                                     int q) {
+  const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+                                pt_high);
 
   // Calculate correction factor
   if (power_term < 1.0)
     assert(error_term >= 0.0);
-  correction_factor = pow(error_term, power_term);
 
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
-  return correction_factor;
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
 // Given a current maxQ value sets a range for future values.
 // PGW TODO..
-// This code removes direct dependency on QIndex to determin the range
+// This code removes direct dependency on QIndex to determine the range
 // (now uses the actual quantizer) but has not been tuned.
 static void adjust_maxq_qrange(VP9_COMP *cpi) {
   int i;
-  double q;
-
   // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
+  double q = cpi->avg_q * 2.0;
   cpi->twopass.maxq_max_limit = cpi->worst_quality;
   for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
     cpi->twopass.maxq_max_limit = i;
@@ -901,12 +941,11 @@
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
+  double section_err = fpstats->coded_error / fpstats->count;
   double sr_correction;
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
@@ -915,23 +954,16 @@
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
 
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
   if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
+    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+                             (fpstats->count * cpi->common.MBs);
+    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
   } else {
     sr_correction = 0.75;
   }
@@ -938,69 +970,58 @@
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
+  if (cpi->rolling_target_bits > 0 &&
+      cpi->active_worst_quality < cpi->worst_quality) {
+    double rolling_ratio = (double)cpi->rolling_actual_bits /
+                               (double)cpi->rolling_target_bits;
 
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
-
     if (rolling_ratio < 0.95)
       cpi->twopass.est_max_qcorrection_factor -= 0.005;
     else if (rolling_ratio > 1.05)
       cpi->twopass.est_max_qcorrection_factor += 0.005;
 
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    cpi->twopass.est_max_qcorrection_factor = fclamp(
+        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
   }
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
+  if (cpi->compressor_speed == 1)
+    speed_correction = cpi->oxcf.cpu_used <= 5 ?
+                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.25;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
     int bits_per_mb_at_this_q;
 
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
+    err_correction_factor = calc_correction_factor(err_per_mb,
+                                                   ERR_DIVISOR, 0.4, 0.90, q) *
+                                sr_correction * speed_correction *
+                                cpi->twopass.est_max_qcorrection_factor;
 
+    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+                                            err_correction_factor);
 
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
-
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
+  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+      q < cpi->cq_target_quality)
+    q = cpi->cq_target_quality;
 
   // Adjust maxq_min_limit and maxq_max_limit limits based on
-  // averaga q observed in clip for non kf/gf/arf frames
+  // average q observed in clip for non kf/gf/arf frames
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 25)) {
+  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+      cpi->ni_frames > 25)
     adjust_maxq_qrange(cpi);
-  }
 
-  return Q;
+  return q;
 }
 
 // For cq mode estimate a cq level that matches the observed
@@ -1008,7 +1029,7 @@
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
                        int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
@@ -1052,23 +1073,23 @@
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
 
   // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
+  for (q = 0; q < MAXQ; q++) {
     int bits_per_mb_at_this_q;
 
     // Error per MB based correction factor
     err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1075,13 +1096,13 @@
   }
 
   // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
+  q = select_cq_level(q);
+  if (q >= cpi->worst_quality)
+    q = cpi->worst_quality - 1;
+  if (q < cpi->best_quality)
+    q = cpi->best_quality;
 
-  return Q;
+  return q;
 }
 
 
@@ -1098,14 +1119,14 @@
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
@@ -1112,14 +1133,13 @@
   // encoded in the second pass is a guess.  However the sum duration is not.
   // Its calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+  vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                       cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1145,7 +1165,8 @@
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1185,9 +1206,8 @@
 
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
+  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+                   cpi->common.MBs;
   if (mb_sr_err_diff <= 512.0) {
     second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
     second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1214,14 +1234,14 @@
   int still_interval,
   double loop_decay_rate,
   double last_decay_rate) {
-  int trans_to_still = FALSE;
+  int trans_to_still = 0;
 
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
+  if (frame_interval > MIN_GF_INTERVAL &&
+      loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
@@ -1243,7 +1263,7 @@
 
     // Only if it does do we signal a transition to still
     if (j == still_interval)
-      trans_to_still = TRUE;
+      trans_to_still = 1;
   }
 
   return trans_to_still;
@@ -1255,7 +1275,7 @@
 static int detect_flash(VP9_COMP *cpi, int offset) {
   FIRSTPASS_STATS next_frame;
 
-  int flash_detected = FALSE;
+  int flash_detected = 0;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
@@ -1265,10 +1285,9 @@
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
     // comapred to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
-      flash_detected = TRUE;
-    }
+    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+        next_frame.pcnt_second_ref >= 0.5)
+      flash_detected = 1;
   }
 
   return flash_detected;
@@ -1350,13 +1369,9 @@
   return frame_boost;
 }
 
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
 
   int i;
@@ -1367,7 +1382,7 @@
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   int arf_boost;
-  int flash_detected = FALSE;
+  int flash_detected = 0;
 
   // Search forward from the proposed arf/next gf position
   for (i = 0; i < f_frames; i++) {
@@ -1379,7 +1394,7 @@
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
-    // We want to discount the the flash frame itself and the recovery
+    // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
     flash_detected = detect_flash(cpi, (i + offset)) ||
                      detect_flash(cpi, (i + offset + 1));
@@ -1386,8 +1401,7 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1423,10 +1437,9 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                          ? MIN_DECAY_FACTOR : decay_accumulator;
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1442,80 +1455,144 @@
   return arf_boost;
 }
 
-static void configure_arnr_filter(VP9_COMP *cpi,
-                                  FIRSTPASS_STATS *this_frame,
-                                  int group_boost) {
-  int half_gf_int;
-  int frames_after_arf;
-  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
-  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
-  int q;
+#if CONFIG_MULTIPLE_ARF
+// Work out the frame coding order for a GF or an ARF group.
+// The current implementation codes frames in their natural order for a
+// GF group, and inserts additional ARFs into an ARF group using a
+// binary split approach.
+// NOTE: this function is currently implemented recursively.
+static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
+                            const int arf_idx, const int gf_or_arf_group,
+                            const int level) {
+  int i, abs_end, half_range;
+  int *cfo = cpi->frame_coding_order;
+  int idx = cpi->new_frame_coding_order_period;
 
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
-  half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count -
-                           this_frame->frame - 1);
+  // If (end < 0) an ARF should be coded at position (-end).
+  assert(start >= 0);
 
-  switch (cpi->oxcf.arnr_type) {
-    case 1: // Backward filter
-      frames_fwd = 0;
-      if (frames_bwd > half_gf_int)
-        frames_bwd = half_gf_int;
-      break;
+  // printf("start:%d end:%d\n", start, end);
 
-    case 2: // Forward filter
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      frames_bwd = 0;
-      break;
+  // GF Group: code frames in logical order.
+  if (gf_or_arf_group == 0) {
+    assert(end >= start);
+    for (i = start; i <= end; ++i) {
+      cfo[idx] = i;
+      cpi->arf_buffer_idx[idx] = arf_idx;
+      cpi->arf_weight[idx] = -1;
+      ++idx;
+    }
+    cpi->new_frame_coding_order_period = idx;
+    return;
+  }
 
-    case 3: // Centered filter
-    default:
-      frames_fwd >>= 1;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
+  // ARF Group: work out the ARF schedule.
+  // Mark ARF frames as negative.
+  if (end < 0) {
+    // printf("start:%d end:%d\n", -end, -end);
+    // ARF frame is at the end of the range.
+    cfo[idx] = end;
+    // What ARF buffer does this ARF use as predictor.
+    cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2;
+    cpi->arf_weight[idx] = level;
+    ++idx;
+    abs_end = -end;
+  } else {
+    abs_end = end;
+  }
 
-      frames_bwd = frames_fwd;
+  half_range = (abs_end - start) >> 1;
 
-      // For even length filter there is one more frame backward
-      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-      if (frames_bwd < half_gf_int)
-        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
-      break;
+  // ARFs may not be adjacent, they must be separated by at least
+  // MIN_GF_INTERVAL non-ARF frames.
+  if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) {
+    // printf("start:%d end:%d\n", start, abs_end);
+    // Update the coding order and active ARF.
+    for (i = start; i <= abs_end; ++i) {
+      cfo[idx] = i;
+      cpi->arf_buffer_idx[idx] = arf_idx;
+      cpi->arf_weight[idx] = -1;
+      ++idx;
+    }
+    cpi->new_frame_coding_order_period = idx;
+  } else {
+    // Place a new ARF at the mid-point of the range.
+    cpi->new_frame_coding_order_period = idx;
+    schedule_frames(cpi, start, -(start + half_range), arf_idx + 1,
+                    gf_or_arf_group, level + 1);
+    schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx,
+                    gf_or_arf_group, level + 1);
   }
+}
 
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+#define FIXED_ARF_GROUP_SIZE 16
 
-  // Adjust the strength based on active max q
-  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
-  if (q > 8) {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+void define_fixed_arf_period(VP9_COMP *cpi) {
+  int i;
+  int max_level = INT_MIN;
+
+  assert(cpi->multi_arf_enabled);
+  assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE);
+
+  // Save the weight of the last frame in the sequence before next
+  // sequence pattern overwrites it.
+  cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
+  assert(cpi->this_frame_weight >= 0);
+
+  // Initialize frame coding order variables.
+  cpi->new_frame_coding_order_period = 0;
+  cpi->next_frame_in_order = 0;
+  cpi->arf_buffered = 0;
+  vp9_zero(cpi->frame_coding_order);
+  vp9_zero(cpi->arf_buffer_idx);
+  vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
+
+  if (cpi->twopass.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
+    // Setup a GF group close to the keyframe.
+    cpi->source_alt_ref_pending = 0;
+    cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
+    schedule_frames(cpi, 0, (cpi->baseline_gf_interval - 1), 2, 0, 0);
   } else {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);
-    if (cpi->active_arnr_strength < 0)
-      cpi->active_arnr_strength = 0;
+    // Setup a fixed period ARF group.
+    cpi->source_alt_ref_pending = 1;
+    cpi->baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
+    schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
   }
 
-  // Adjust number of frames in filter and strength based on gf boost level.
-  if (cpi->active_arnr_frames > (group_boost / 150)) {
-    cpi->active_arnr_frames = (group_boost / 150);
-    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  // Replace level indicator of -1 with correct level.
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    if (cpi->arf_weight[i] > max_level) {
+      max_level = cpi->arf_weight[i];
+    }
   }
-  if (cpi->active_arnr_strength > (group_boost / 300)) {
-    cpi->active_arnr_strength = (group_boost / 300);
+  ++max_level;
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    if (cpi->arf_weight[i] == -1) {
+      cpi->arf_weight[i] = max_level;
+    }
   }
+  cpi->max_arf_level = max_level;
+#if 0
+  printf("\nSchedule: ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->frame_coding_order[i]);
+  }
+  printf("\n");
+  printf("ARFref:   ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->arf_buffer_idx[i]);
+  }
+  printf("\n");
+  printf("Weight:   ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->arf_weight[i]);
+  }
+  printf("\n");
+#endif
 }
+#endif
 
-// Analyse and define a gf/arf group .
+// Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame;
   FIRSTPASS_STATS *start_pos;
@@ -1619,10 +1696,10 @@
       }
 
       // Break clause to detect very still sections after motion
-      // (for example a staic image after a fade or other transition).
+      // (for example a static image after a fade or other transition).
       if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
-        allow_alt_ref = FALSE;
+        allow_alt_ref = 0;
         break;
       }
     }
@@ -1637,9 +1714,9 @@
       // Break at cpi->max_gf_interval unless almost totally static
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
-        // Dont break out with a very short interval
+        // Don't break out with a very short interval
         (i > MIN_GF_INTERVAL) &&
-        // Dont break out very close to a key frame
+        // Don't break out very close to a key frame
         ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
         (!flash_detected) &&
@@ -1652,12 +1729,12 @@
       break;
     }
 
-    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+    *this_frame = next_frame;
 
     old_boost_score = boost_score;
   }
 
-  // Dont allow a gf too near the next kf
+  // Don't allow a gf too near the next kf
   if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < cpi->twopass.frames_to_key) {
       i++;
@@ -1672,10 +1749,22 @@
     }
   }
 
-  // Set the interval till the next gf or arf.
+  // Set the interval until the next gf or arf.
   cpi->baseline_gf_interval = i;
 
-  // Should we use the alternate refernce frame
+#if CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled) {
+    // Initialize frame coding order variables.
+    cpi->new_frame_coding_order_period = 0;
+    cpi->next_frame_in_order = 0;
+    cpi->arf_buffered = 0;
+    vp9_zero(cpi->frame_coding_order);
+    vp9_zero(cpi->arf_buffer_idx);
+    vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
+  }
+#endif
+
+  // Should we use the alternate reference frame
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
@@ -1686,16 +1775,66 @@
       ((mv_in_out_accumulator / (double)i > -0.2) ||
        (mv_in_out_accumulator > -2.0)) &&
       (boost_score > 100)) {
-    // Alterrnative boost calculation for alt ref
+    // Alternative boost calculation for alt ref
     cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
-    cpi->source_alt_ref_pending = TRUE;
+    cpi->source_alt_ref_pending = 1;
 
-    configure_arnr_filter(cpi, this_frame, cpi->gfu_boost);
+#if CONFIG_MULTIPLE_ARF
+    // Set the ARF schedule.
+    if (cpi->multi_arf_enabled) {
+      schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+    }
+#endif
   } else {
     cpi->gfu_boost = (int)boost_score;
-    cpi->source_alt_ref_pending = FALSE;
+    cpi->source_alt_ref_pending = 0;
+#if CONFIG_MULTIPLE_ARF
+    // Set the GF schedule.
+    if (cpi->multi_arf_enabled) {
+      schedule_frames(cpi, 0, cpi->baseline_gf_interval - 1, 2, 0, 0);
+      assert(cpi->new_frame_coding_order_period == cpi->baseline_gf_interval);
+    }
+#endif
   }
 
+#if CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) {
+    int max_level = INT_MIN;
+    // Replace level indicator of -1 with correct level.
+    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
+      if (cpi->arf_weight[i] > max_level) {
+        max_level = cpi->arf_weight[i];
+      }
+    }
+    ++max_level;
+    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
+      if (cpi->arf_weight[i] == -1) {
+        cpi->arf_weight[i] = max_level;
+      }
+    }
+    cpi->max_arf_level = max_level;
+  }
+#if 0
+  if (cpi->multi_arf_enabled) {
+    printf("\nSchedule: ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->frame_coding_order[i]);
+    }
+    printf("\n");
+    printf("ARFref:   ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->arf_buffer_idx[i]);
+    }
+    printf("\n");
+    printf("Weight:   ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->arf_weight[i]);
+    }
+    printf("\n");
+  }
+#endif
+#endif
+
   // Now decide how many bits should be allocated to the GF group as  a
   // proportion of those remaining in the kf group.
   // The final key frame group in the clip is treated as a special case
@@ -1702,7 +1841,7 @@
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -1736,29 +1875,26 @@
   cpi->twopass.modified_error_used += gf_group_err;
 
   // Assign  bits to the arf or gf.
-  for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
-    int boost;
+  for (i = 0;
+      i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
+      ++i) {
     int allocation_chunks;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+                                  : cpi->oxcf.fixed_q;
     int gf_bits;
 
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
+    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
 
     if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
-    if (boost > 1028) {
+    if (boost > 1023) {
       int divisor = boost >> 10;
       boost /= divisor;
       allocation_chunks /= divisor;
@@ -1766,41 +1902,34 @@
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+                                       (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
+      double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
 
-      if (gf_bits > alt_gf_bits) {
+      if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
     // Else if it is harder than other frames in the group make sure it at
     // least receives an allocation in keeping with its relative error
     // score, otherwise it may be worse off than an "un-boosted" frame
     else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+                        mod_frame_err /
+                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
 
-      if (alt_gf_bits > gf_bits) {
+      if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
 
     // Dont allow a negative value for gf_bits
@@ -1807,18 +1936,21 @@
     if (gf_bits < 0)
       gf_bits = 0;
 
-    gf_bits += cpi->min_frame_bandwidth;                     // Add in minimum for a frame
+    // Add in minimum for a frame
+    gf_bits += cpi->min_frame_bandwidth;
 
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
-    if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
-      cpi->per_frame_bandwidth = gf_bits;                 // Per frame bit target for this frame
+    if (i == 1 || (!cpi->source_alt_ref_pending
+        && (cpi->common.frame_type != KEY_FRAME))) {
+      // Per frame bit target for this frame
+      cpi->per_frame_bandwidth = gf_bits;
     }
   }
 
   {
-    // Adjust KF group bits and error remainin
+    // Adjust KF group bits and error remaining
     cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
     cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
 
@@ -1835,33 +1967,27 @@
     else
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
 
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
+        - cpi->min_frame_bandwidth;
 
     if (cpi->twopass.gf_group_bits < 0)
       cpi->twopass.gf_group_bits = 0;
 
     // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the
-    // calculation of cpi->twopass.alt_extra_bits.
+    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // calculation of alt_extra_bits.
     if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
+      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
 
       if (boost >= 150) {
-        int pct_extra;
-
-        pct_extra = (boost - 100) / 50;
+        int alt_extra_bits;
+        int pct_extra = (boost - 100) / 50;
         pct_extra = (pct_extra > 20) ? 20 : pct_extra;
 
-        cpi->twopass.alt_extra_bits = (int)
-          ((cpi->twopass.gf_group_bits * pct_extra) / 100);
-        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
-        cpi->twopass.alt_extra_bits /=
-          ((cpi->baseline_gf_interval - 1) >> 1);
-      } else
-        cpi->twopass.alt_extra_bits = 0;
-    } else
-      cpi->twopass.alt_extra_bits = 0;
+        alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
+        cpi->twopass.gf_group_bits -= alt_extra_bits;
+      }
+    }
   }
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -1887,24 +2013,28 @@
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
 static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int    target_frame_size;                                                             // gf_group_error_left
+  int target_frame_size;
 
   double modified_err;
-  double err_fraction;                                                                 // What portion of the remaining GF group error is used by this frame
+  double err_fraction;
 
-  int max_bits = frame_max_bits(cpi);    // Max for a single frame
+  // Max for a single frame.
+  int max_bits = frame_max_bits(cpi);
 
-  // Calculate modified prediction error used in bit allocation
+  // Calculate modified prediction error used in bit allocation.
   modified_err = calculate_modified_err(cpi, this_frame);
 
   if (cpi->twopass.gf_group_error_left > 0)
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;                              // What portion of the remaining GF group error is used by this frame
+    // What portion of the remaining GF group error is used by this frame.
+    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
   else
     err_fraction = 0.0;
 
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);                    // How many of those bits available for allocation should we give it?
+  // How many of those bits available for allocation should we give it?
+  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
 
-  // Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at the top end.
+  // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
+  // the top end.
   if (target_frame_size < 0)
     target_frame_size = 0;
   else {
@@ -1915,54 +2045,43 @@
       target_frame_size = (int)cpi->twopass.gf_group_bits;
   }
 
-  // Adjust error remaining
+  // Adjust error and bits remaining.
   cpi->twopass.gf_group_error_left -= (int64_t)modified_err;
-  cpi->twopass.gf_group_bits -= target_frame_size;                                                // Adjust bits remaining
+  cpi->twopass.gf_group_bits -= target_frame_size;
 
   if (cpi->twopass.gf_group_bits < 0)
     cpi->twopass.gf_group_bits = 0;
 
-  target_frame_size += cpi->min_frame_bandwidth;                                          // Add in the minimum number of bits that is set aside for every frame.
+  // Add in the minimum number of bits that is set aside for every frame.
+  target_frame_size += cpi->min_frame_bandwidth;
 
-
-  cpi->per_frame_bandwidth = target_frame_size;                                           // Per frame bit target for this frame
+  // Per frame bit target for this frame.
+  cpi->per_frame_bandwidth = target_frame_size;
 }
 
 // Make a damped adjustment to the active max q.
 static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
   int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
+  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
+  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
+  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
 
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
-
   if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i <= new_maxqi; i++)
+      if (vp9_convert_qindex_to_q(i) >= target_q)
+        return i;
   } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i >= new_maxqi; i--)
+      if (vp9_convert_qindex_to_q(i) <= target_q)
+        return i;
   }
 
-  return ret_val;
+  return new_maxqi;
 }
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -1971,9 +2090,8 @@
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (!cpi->twopass.stats_in) {
+  if (!cpi->twopass.stats_in)
     return;
-  }
 
   vp9_clear_system_state();
 
@@ -1983,13 +2101,9 @@
 
     // Set a cq_level in constrained quality mode.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
+      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                               (int)(cpi->twopass.bits_left / frames_left));
 
-      est_cq =
-        estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left));
-
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
         cpi->cq_target_quality = est_cq;
@@ -1999,14 +2113,12 @@
     cpi->twopass.maxq_max_limit = cpi->worst_quality;
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
+    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                           (int)(cpi->twopass.bits_left / frames_left));
 
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+    cpi->active_worst_quality = tmp_q;
+    cpi->ni_av_qi = tmp_q;
+    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
 #ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
@@ -2024,15 +2136,15 @@
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2051,7 +2163,7 @@
   // keyframe and section processing !
   if (cpi->twopass.frames_to_key == 0) {
     // Define next KF group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
   }
 
@@ -2058,9 +2170,18 @@
   // Is this a GF / ARF (Note that a KF is always also a GF)
   if (cpi->frames_till_gf_update_due == 0) {
     // Define next gf group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    define_gf_group(cpi, &this_frame_copy);
+    this_frame_copy = this_frame;
 
+#if CONFIG_MULTIPLE_ARF
+    if (cpi->multi_arf_enabled) {
+      define_fixed_arf_period(cpi);
+    } else {
+#endif
+      define_gf_group(cpi, &this_frame_copy);
+#if CONFIG_MULTIPLE_ARF
+    }
+#endif
+
     // If we are going to code an altref frame at the end of the group
     // and the current frame is not a key frame....
     // If the previous group used an arf this frame has already benefited
@@ -2071,7 +2192,7 @@
       // Assign a standard frames worth of bits from those allocated
       // to the GF group
       int bak = cpi->per_frame_bandwidth;
-      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      this_frame_copy = this_frame;
       assign_std_frame_bits(cpi, &this_frame_copy);
       cpi->per_frame_bandwidth = bak;
     }
@@ -2078,7 +2199,7 @@
   } else {
     // Otherwise this is an ordinary frame
     // Assign bits from those allocated to the GF group
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy =  this_frame;
     assign_std_frame_bits(cpi, &this_frame_copy);
   }
 
@@ -2101,8 +2222,8 @@
 
   cpi->twopass.frames_to_key--;
 
-  // Update the total stats remaining sturcture
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  // Update the total stats remaining structure
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
@@ -2109,7 +2230,7 @@
                              FIRSTPASS_STATS *last_frame,
                              FIRSTPASS_STATS *this_frame,
                              FIRSTPASS_STATS *next_frame) {
-  int is_viable_kf = FALSE;
+  int is_viable_kf = 0;
 
   // Does the frame satisfy the primary criteria of a key frame
   //      If so, then examine how well it predicts subsequent frames
@@ -2136,7 +2257,7 @@
     double decay_accumulator = 1.0;
     double next_iiratio;
 
-    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+    local_next_frame = *next_frame;
 
     // Note the starting file position so we can reset to it
     start_pos = cpi->twopass.stats_in;
@@ -2178,14 +2299,15 @@
         break;
     }
 
-    // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out else discard this potential key frame and move on
     if (boost_score > 30.0 && (i > 3))
-      is_viable_kf = TRUE;
+      is_viable_kf = 1;
     else {
       // Reset the file position
       reset_fpf_position(cpi, start_pos);
 
-      is_viable_kf = FALSE;
+      is_viable_kf = 0;
     }
   }
 
@@ -2201,7 +2323,6 @@
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0;
-  double old_boost_score = 0.0;
   double loop_decay_rate;
 
   double kf_mod_err = 0.0;
@@ -2221,7 +2342,7 @@
   cpi->this_key_frame_forced = cpi->next_key_frame_forced;
 
   // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->source_alt_ref_active = FALSE;
+  cpi->source_alt_ref_active = 0;
 
   // Kf is always a gf so clear frames till next gf counter
   cpi->frames_till_gf_update_due = 0;
@@ -2229,9 +2350,9 @@
   cpi->twopass.frames_to_key = 1;
 
   // Take a copy of the initial frame details
-  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+  first_frame = *this_frame;
 
-  cpi->twopass.kf_group_bits = 0;        // Total bits avaialable to kf group
+  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
   cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
@@ -2248,7 +2369,7 @@
     kf_group_coded_err += this_frame->coded_error;
 
     // load a the next frame's stats
-    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+    last_frame = *this_frame;
     input_stats(cpi, this_frame);
 
     // Provided that we are not at the end of the file...
@@ -2255,10 +2376,10 @@
     if (cpi->oxcf.auto_key
         && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
       // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
-      }
 
+
       // How fast is prediction quality decaying
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
@@ -2267,20 +2388,15 @@
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
+      for (j = 0; j < 8; j++)
+        decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
       // to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
+      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+                                     loop_decay_rate, decay_accumulator))
         break;
-      }
 
-
       // Step on to the next frame
       cpi->twopass.frames_to_key++;
 
@@ -2306,7 +2422,7 @@
     cpi->twopass.frames_to_key /= 2;
 
     // Copy first frame details
-    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+    tmp_frame = first_frame;
 
     // Reset to the start of the group
     reset_fpf_position(cpi, start_position);
@@ -2329,9 +2445,9 @@
     // Reset to the start of the group
     reset_fpf_position(cpi, current_pos);
 
-    cpi->next_key_frame_forced = TRUE;
+    cpi->next_key_frame_forced = 1;
   } else
-    cpi->next_key_frame_forced = FALSE;
+    cpi->next_key_frame_forced = 0;
 
   // Special case for the last frame of the file
   if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
@@ -2373,6 +2489,7 @@
   boost_score = 0.0;
   loop_decay_rate = 1.00;       // Starting decay rate
 
+  // Scan through the kf group collating various stats.
   for (i = 0; i < cpi->twopass.frames_to_key; i++) {
     double r;
 
@@ -2379,16 +2496,6 @@
     if (EOF == input_stats(cpi, &next_frame))
       break;
 
-    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
-      r = (IIKFACTOR2 * next_frame.intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-    else
-      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
-    if (r > RMAX)
-      r = RMAX;
-
     // Monitor for static sections.
     if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
         zero_motion_accumulator) {
@@ -2396,22 +2503,28 @@
         (next_frame.pcnt_inter - next_frame.pcnt_motion);
     }
 
-    // How fast is prediction quality decaying
-    if (!detect_flash(cpi, 0)) {
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                            ? MIN_DECAY_FACTOR : decay_accumulator;
-    }
+    // For the first few frames collect data to decide kf boost.
+    if (i <= (cpi->max_gf_interval * 2)) {
+      if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+        r = (IIKFACTOR2 * next_frame.intra_error /
+             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+      else
+        r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
 
-    boost_score += (decay_accumulator * r);
+      if (r > RMAX)
+        r = RMAX;
 
-    if ((i > MIN_GF_INTERVAL) &&
-        ((boost_score - old_boost_score) < 6.25)) {
-      break;
-    }
+      // How fast is prediction quality decaying
+      if (!detect_flash(cpi, 0)) {
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+        decay_accumulator = decay_accumulator * loop_decay_rate;
+        decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
+      }
 
-    old_boost_score = boost_score;
+      boost_score += (decay_accumulator * r);
+    }
   }
 
   {
@@ -2441,8 +2554,8 @@
     int allocation_chunks;
     int alt_kf_bits;
 
-    if (kf_boost < (cpi->twopass.frames_to_key * 5))
-      kf_boost = (cpi->twopass.frames_to_key * 5);
+    if (kf_boost < (cpi->twopass.frames_to_key * 3))
+      kf_boost = (cpi->twopass.frames_to_key * 3);
 
     if (kf_boost < 300) // Min KF boost
       kf_boost = 300;
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -46,7 +46,7 @@
       unsigned int i;
 
       for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+        vp9_free_frame_buffer(&ctx->buf[i].img);
       free(ctx->buf);
     }
     free(ctx);
@@ -56,6 +56,8 @@
 
 struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
                                           unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
                                           unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -71,8 +73,9 @@
     if (!ctx->buf)
       goto bail;
     for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP9BORDERINPIXELS))
+      if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
+                                 width, height, subsampling_x, subsampling_y,
+                                 VP9BORDERINPIXELS))
         goto bail;
   }
   return ctx;
@@ -81,14 +84,17 @@
   return NULL;
 }
 
+#define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
                        int64_t ts_start, int64_t ts_end, unsigned int flags,
                        unsigned char *active_map) {
   struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
   int row, col, active_end;
   int mb_rows = (src->y_height + 15) >> 4;
   int mb_cols = (src->y_width + 15) >> 4;
+#endif
 
   if (ctx->sz + 1 > ctx->max_sz)
     return 1;
@@ -95,6 +101,10 @@
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
 
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
   // Only do this partial copy if the following conditions are all met:
   // 1. Lookahead queue has has size of 1.
   // 2. Active map is provided.
@@ -137,6 +147,11 @@
   } else {
     vp9_copy_and_extend_frame(src, &buf->img);
   }
+#else
+  // Partial copy not implemented yet
+  vp9_copy_and_extend_frame(src, &buf->img);
+#endif
+
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
   buf->flags = flags;
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -31,6 +31,8 @@
  */
 struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
                                          unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
                                          unsigned int depth);
 
 
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -9,13 +9,13 @@
  */
 
 #include <limits.h>
+
+#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_encodeintra.h>
 #include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_setupintrarecon.h>
 #include <vp9/common/vp9_blockd.h>
 #include <vp9/common/vp9_reconinter.h>
 #include <vp9/common/vp9_systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_segmentation.h>
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
@@ -25,21 +25,18 @@
                                               int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b  = &x->block[0];
-  BLOCKD *d = &xd->block[0];
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   unsigned int best_err;
 
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
+  const int tmp_col_min = x->mv_col_min;
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
   int_mv ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.first_step +
-      (cpi->Speed < 8 ? (cpi->Speed > 5 ? 1 : 0) : 2);
+      (cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
 
   vp9_clamp_mv_min_max(x, ref_mv);
 
@@ -47,15 +44,8 @@
   ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(
-      x, b, d,
-      &ref_full, dst_mv,
-      step_param,
-      x->errorperbit,
-      &v_fn_ptr,
-      NULL, NULL,
-      NULL, NULL,
-      ref_mv);
+  best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit,
+                            &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -63,7 +53,7 @@
     int distortion;
     unsigned int sse;
     best_err = cpi->find_fractional_mv_step(
-        x, b, d,
+        x,
         dst_mv, ref_mv,
         x->errorperbit, &v_fn_ptr,
         NULL, NULL,
@@ -71,9 +61,10 @@
   }
 
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
-  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
-                          xd->predictor, 16, INT_MAX);
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
+  best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                          INT_MAX);
 
   /* restore UMV window */
   x->mv_col_min = tmp_col_min;
@@ -84,42 +75,20 @@
   return best_err;
 }
 
-static int do_16x16_motion_search
-(
-  VP9_COMP *cpi,
-  int_mv *ref_mv,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset,
-  int mb_row,
-  int mb_col) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_motion_search(VP9_COMP *cpi,
+                                  int_mv *ref_mv, int_mv *dst_mv,
+                                  int buf_mb_y_offset, int mb_y_offset,
+                                  int mb_row, int mb_col) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
   int_mv tmp_mv;
-  int n;
 
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -126,7 +95,7 @@
   // starting point (best reference) for the search
   tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
-    err            = tmp_err;
+    err = tmp_err;
     dst_mv->as_int = tmp_mv.as_int;
   }
 
@@ -147,51 +116,26 @@
   return err;
 }
 
-static int do_16x16_zerozero_search
-(
-  VP9_COMP *cpi,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_zerozero_search(VP9_COMP *cpi,
+                                    int_mv *dst_mv,
+                                    int buf_mb_y_offset, int mb_y_offset) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err;
-  int n;
 
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
 
   dst_mv->as_int = 0;
 
   return err;
 }
-static int find_best_16x16_intra
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  MB_PREDICTION_MODE *pbest_mode
-) {
+static int find_best_16x16_intra(VP9_COMP *cpi,
+                                 int mb_y_offset,
+                                 MB_PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_PREDICTION_MODE best_mode = -1, mode;
@@ -201,11 +145,19 @@
   // we're intentionally not doing 4x4, we just want a rough estimate
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     unsigned int err;
+    const int bwl = b_width_log2(BLOCK_SIZE_MB16X16),  bw = 4 << bwl;
+    const int bhl = b_height_log2(BLOCK_SIZE_MB16X16), bh = 4 << bhl;
 
     xd->mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_mby(xd);
-    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
-                       buf->y_stride, best_err);
+    vp9_build_intra_predictors(x->plane[0].src.buf, x->plane[0].src.stride,
+                               xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                               xd->mode_info_context->mbmi.mode,
+                               bw, bh,
+                               xd->up_available, xd->left_available,
+                               xd->right_available);
+    err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
+
     // find best
     if (err < best_err) {
       best_err  = err;
@@ -234,15 +186,21 @@
   int mb_row,
   int mb_col
 ) {
-  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int intra_error;
+  VP9_COMMON *cm = &cpi->common;
 
   // FIXME in practice we're completely ignoring chroma here
-  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.stride = buf->y_stride;
 
+  xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset;
+  xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
+
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
+  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
+                                      &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
   stats->ref[INTRA_FRAME].err = intra_error;
@@ -249,11 +207,14 @@
 
   // Golden frame MV search, if it exists and is different than last frame
   if (golden_ref) {
-    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
-                                                &stats->ref[GOLDEN_FRAME].m.mv,
-                                                buf, mb_y_offset,
-                                                golden_ref, gld_y_offset,
-                                                mb_row, mb_col);
+    int g_motion_error;
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
+    g_motion_error = do_16x16_motion_search(cpi,
+                                            prev_golden_ref_mv,
+                                            &stats->ref[GOLDEN_FRAME].m.mv,
+                                            mb_y_offset, gld_y_offset,
+                                            mb_row, mb_col);
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
@@ -262,17 +223,13 @@
 
   // Alt-ref frame MV search, if it exists and is different than last/golden frame
   if (alt_ref) {
-    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
-    //                                            &stats->ref[ALTREF_FRAME].m.mv,
-    //                                            buf, mb_y_offset,
-    //                                            alt_ref, arf_y_offset);
+    int a_motion_error;
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
+    a_motion_error = do_16x16_zerozero_search(cpi,
+                                              &stats->ref[ALTREF_FRAME].m.mv,
+                                              mb_y_offset, arf_y_offset);
 
-    int a_motion_error =
-      do_16x16_zerozero_search(cpi,
-                               &stats->ref[ALTREF_FRAME].m.mv,
-                               buf, mb_y_offset,
-                               alt_ref, arf_y_offset);
-
     stats->ref[ALTREF_FRAME].err = a_motion_error;
   } else {
     stats->ref[ALTREF_FRAME].err = INT_MAX;
@@ -280,17 +237,15 @@
   }
 }
 
-static void update_mbgraph_frame_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_FRAME_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  YV12_BUFFER_CONFIG *golden_ref,
-  YV12_BUFFER_CONFIG *alt_ref
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  VP9_COMMON   *const cm = &cpi->common;
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
   int mb_col, mb_row, offset = 0;
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
   int_mv arf_top_mv, gld_top_mv;
@@ -302,14 +257,17 @@
   // Set up limit values for motion vectors to prevent them extending outside the UMV borders
   arf_top_mv.as_int = 0;
   gld_top_mv.as_int = 0;
-  x->mv_row_min     = -(VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND);
-  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP9BORDERINPIXELS
-                      - 16 - VP9_INTERP_EXTEND;
+  x->mv_row_min     = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
+  x->mv_row_max     = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS
+                      - 8 - VP9_INTERP_EXTEND;
   xd->up_available  = 0;
-  xd->dst.y_stride  = buf->y_stride;
-  xd->pre.y_stride  = buf->y_stride;
-  xd->dst.uv_stride = buf->uv_stride;
+  xd->plane[0].dst.stride  = buf->y_stride;
+  xd->plane[0].pre[0].stride  = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
   xd->mode_info_context = &mi_local;
+  mi_local.mbmi.sb_type = BLOCK_SIZE_MB16X16;
+  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
+  mi_local.mbmi.ref_frame[1] = NONE;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     int_mv arf_left_mv, gld_left_mv;
@@ -320,9 +278,9 @@
     // Set up limit values for motion vectors to prevent them extending outside the UMV borders
     arf_left_mv.as_int = arf_top_mv.as_int;
     gld_left_mv.as_int = gld_top_mv.as_int;
-    x->mv_col_min      = -(VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND);
-    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP9BORDERINPIXELS
-                         - 16 - VP9_INTERP_EXTEND;
+    x->mv_col_min      = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
+    x->mv_col_max      = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS
+                         - 8 - VP9_INTERP_EXTEND;
     xd->left_available = 0;
 
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
@@ -379,8 +337,7 @@
     for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
          offset += cm->mb_cols, mb_row++) {
       for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats =
-          &frame_stats->mb_stats[offset + mb_col];
+        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
 
         int altref_err = mb_stats->ref[ALTREF_FRAME].err;
         int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
@@ -387,9 +344,9 @@
         int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
 
         // Test for altref vs intra and gf and that its mv was 0,0.
-        if ((altref_err > 1000) ||
-            (altref_err > intra_err) ||
-            (altref_err > golden_err)) {
+        if (altref_err > 1000 ||
+            altref_err > intra_err ||
+            altref_err > golden_err) {
           arf_not_zz[offset + mb_col]++;
         }
       }
@@ -404,10 +361,16 @@
       // goes in segment 0
       if (arf_not_zz[offset + mb_col]) {
         ncnt[0]++;
-        cpi->segmentation_map[offset + mb_col] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 0;
       } else {
+        cpi->segmentation_map[offset * 4 + 2 * mb_col] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
         ncnt[1]++;
-        cpi->segmentation_map[offset + mb_col] = 1;
       }
     }
   }
@@ -425,10 +388,10 @@
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
+    vp9_enable_segmentation((VP9_PTR)cpi);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
+    vp9_disable_segmentation((VP9_PTR)cpi);
   }
 
   // Free localy allocated storage
@@ -463,8 +426,7 @@
   // the ARF MC search backwards, to get optimal results for MV caching
   for (i = 0; i < n_frames; i++) {
     MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur =
-      vp9_lookahead_peek(cpi->lookahead, i);
+    struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i);
 
     assert(q_cur != NULL);
 
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -56,8 +56,9 @@
   MV v;
   v.row = mv->as_mv.row - ref->as_mv.row;
   v.col = mv->as_mv.col - ref->as_mv.col;
-  return ((mvjcost[vp9_get_mv_joint(v)] +
-           mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7;
+  return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
+                             mvcost[0][v.row] +
+                             mvcost[1][v.col]) * weight, 7);
 }
 
 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
@@ -66,9 +67,9 @@
     MV v;
     v.row = mv->as_mv.row - ref->as_mv.row;
     v.col = mv->as_mv.col - ref->as_mv.col;
-    return ((mvjcost[vp9_get_mv_joint(v)] +
-             mvcost[0][v.row] + mvcost[1][v.col]) *
-            error_per_bit + 4096) >> 13;
+    return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
+                               mvcost[0][v.row] +
+                               mvcost[1][v.col]) * error_per_bit, 13);
   }
   return 0;
 }
@@ -79,9 +80,9 @@
     MV v;
     v.row = mv->as_mv.row - ref->as_mv.row;
     v.col = mv->as_mv.col - ref->as_mv.col;
-    return ((mvjsadcost[vp9_get_mv_joint(v)] +
-             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
+    return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] +
+                               mvsadcost[0][v.row] +
+                               mvsadcost[1][v.col]) * error_per_bit, 8);
   }
   return 0;
 }
@@ -222,7 +223,7 @@
 
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
+    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse)
 
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
@@ -238,7 +239,7 @@
     },                                                                   \
     v = INT_MAX;)
 
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
                                              int_mv *bestmv, int_mv *ref_mv,
                                              int error_per_bit,
                                              const vp9_variance_fn_ptr_t *vfp,
@@ -245,7 +246,8 @@
                                              int *mvjcost, int *mvcost[2],
                                              int *distortion,
                                              unsigned int *sse1) {
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   MACROBLOCKD *xd = &x->e_mbd;
 
   int rr, rc, br, bc, hstep;
@@ -263,10 +265,12 @@
   int offset;
   int usehp = xd->allow_high_precision_mv;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
 
+  y_stride = xd->plane[0].pre[0].stride;
+
   rr = ref_mv->as_mv.row;
   rc = ref_mv->as_mv.col;
   br = bestmv->as_mv.row << 3;
@@ -288,7 +292,7 @@
   bestmv->as_mv.col <<= 3;
 
   // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
                          error_per_bit, xd->allow_high_precision_mv);
@@ -409,6 +413,200 @@
 
   return besterr;
 }
+
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): central pointer error was already calculated in full-
+  // pixel search, and can be passed in this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in
+  // common with the last iteration could be 2 ( if diag selected)
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -417,7 +615,7 @@
 #undef MIN
 #undef MAX
 
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
                                  int_mv *bestmv, int_mv *ref_mv,
                                  int error_per_bit,
                                  const vp9_variance_fn_ptr_t *vfp,
@@ -428,7 +626,8 @@
   int_mv this_mv;
   int_mv orig_mv;
   int yrow_movedback = 0, ycol_movedback = 0;
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -437,9 +636,10 @@
   MACROBLOCKD *xd = &x->e_mbd;
   int usehp = xd->allow_high_precision_mv;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+  y_stride = xd->plane[0].pre[0].stride;
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -448,7 +648,7 @@
   orig_mv = *bestmv;
 
   // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = bestmse;
   bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit,
                          xd->allow_high_precision_mv);
@@ -456,7 +656,7 @@
   // go left then right and check error
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -468,7 +668,7 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -482,7 +682,7 @@
   // go up then down and check error
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                              xd->allow_high_precision_mv);
 
@@ -494,7 +694,7 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -516,23 +716,25 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, src_stride,
+                                    &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, src_stride,
+                                    &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
       break;
   }
 
@@ -571,11 +773,11 @@
     this_mv.as_mv.col = startmv.as_mv.col - 2;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
     thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                       b->src_stride, &sse);
+                       src_stride, &sse);
   }
 
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -591,7 +793,7 @@
   this_mv.as_mv.col += 4;
   thismse = vfp->svf(y, y_stride,
                      SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -609,11 +811,11 @@
     this_mv.as_mv.row = startmv.as_mv.row - 2;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
     thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   }
 
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -628,7 +830,7 @@
 
   this_mv.as_mv.row += 4;
   thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -655,10 +857,13 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+          thismse = vfp->svf(y, y_stride,
+                             SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);;
+          thismse = vfp->svf(y - 1, y_stride,
+                             SP(6), SP(this_mv.as_mv.row), z, src_stride, &sse);
         }
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
@@ -665,10 +870,12 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride, y_stride,
+                             SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride - 1, y_stride,
+                             SP(6), SP(6), z, src_stride, &sse);
         }
       }
 
@@ -678,10 +885,13 @@
 
       if (startmv.as_mv.row & 7) {
         this_mv.as_mv.row -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - y_stride, y_stride,
+                           SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
       }
 
       break;
@@ -690,12 +900,13 @@
 
       if (startmv.as_mv.col & 7) {
         this_mv.as_mv.col -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                           z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
         thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                           b->src_stride, &sse);
+                           src_stride, &sse);
       }
 
       break;
@@ -704,7 +915,7 @@
       this_mv.as_mv.row += 2;
       thismse = vfp->svf(y, y_stride,
                          SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                         z, b->src_stride, &sse);
+                         z, src_stride, &sse);
       break;
   }
 
@@ -746,11 +957,11 @@
     this_mv.as_mv.col = startmv.as_mv.col - 1;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
     thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   }
 
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -765,7 +976,7 @@
 
   this_mv.as_mv.col += 2;
   thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -781,10 +992,13 @@
 
   if (startmv.as_mv.row & 7) {
     this_mv.as_mv.row = startmv.as_mv.row - 1;
-    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+    thismse = vfp->svf(y - y_stride, y_stride,
+                       SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
   }
 
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -798,7 +1012,9 @@
   }
 
   this_mv.as_mv.row += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  thismse = vfp->svf(y, y_stride,
+                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -824,10 +1040,14 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+          thismse = vfp->svf(y, y_stride,
+                             SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);;
+          thismse = vfp->svf(y - 1, y_stride,
+                             SP(7), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         }
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
@@ -834,10 +1054,12 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride, y_stride,
+                             SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride - 1, y_stride,
+                             SP(7), SP(7), z, src_stride, &sse);
         }
       }
 
@@ -847,10 +1069,13 @@
 
       if (startmv.as_mv.row & 7) {
         this_mv.as_mv.row -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - y_stride, y_stride,
+                           SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
       }
 
       break;
@@ -859,10 +1084,13 @@
 
       if (startmv.as_mv.col & 7) {
         this_mv.as_mv.col -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - 1, y_stride,
+                           SP(7), SP(this_mv.as_mv.row), z, src_stride, &sse);
       }
 
       break;
@@ -869,7 +1097,9 @@
     case 3:
       this_mv.as_mv.col += 1;
       this_mv.as_mv.row += 1;
-      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      thismse = vfp->svf(y, y_stride,
+                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                         z, src_stride, &sse);
       break;
   }
 
@@ -888,7 +1118,7 @@
 
 #undef SP
 
-int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_half_pixel_step(MACROBLOCK *x,
                                   int_mv *bestmv, int_mv *ref_mv,
                                   int error_per_bit,
                                   const vp9_variance_fn_ptr_t *vfp,
@@ -898,7 +1128,8 @@
   int bestmse = INT_MAX;
   int_mv startmv;
   int_mv this_mv;
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -906,9 +1137,9 @@
   int y_stride;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+      (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col;
+  y_stride = xd->plane[0].pre[0].stride;
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -916,7 +1147,7 @@
   startmv = *bestmv;
 
   // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = bestmse;
   bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit,
                          xd->allow_high_precision_mv);
@@ -924,7 +1155,7 @@
   // go left then right and check error
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -936,7 +1167,7 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -950,7 +1181,7 @@
   // go up then down and check error
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                              xd->allow_high_precision_mv);
 
@@ -962,7 +1193,7 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -981,23 +1212,25 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride,
+                                    z, src_stride, &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride,
+                                    z, src_stride, &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
       break;
   }
 
@@ -1057,8 +1290,6 @@
 int vp9_hex_search
 (
   MACROBLOCK *x,
-  BLOCK *b,
-  BLOCKD *d,
   int_mv *ref_mv,
   int_mv *best_mv,
   int search_param,
@@ -1068,13 +1299,14 @@
   int *mvjcost, int *mvcost[2],
   int_mv *center_mv
 ) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
   MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
   int i, j;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   int br, bc;
   int_mv this_mv;
   unsigned int bestsad = 0x7fffffff;
@@ -1095,8 +1327,8 @@
   bc = ref_mv->as_mv.col;
 
   // Work out the start point for the search
-  base_offset = (uint8_t *)(*(d->base_pre) + d->pre);
-  this_offset = base_offset + (br * (d->pre_stride)) + bc;
+  base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
+  this_offset = base_offset + (br * (xd->plane[0].pre[0].stride)) + bc;
   this_mv.as_mv.row = br;
   this_mv.as_mv.col = bc;
   bestsad = vfp->sdf(what, what_stride, this_offset,
@@ -1211,7 +1443,7 @@
 #undef CHECK_POINT
 #undef CHECK_BETTER
 
-int vp9_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_diamond_search_sad_c(MACROBLOCK *x,
                              int_mv *ref_mv, int_mv *best_mv,
                              int search_param, int sad_per_bit, int *num00,
                              vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
@@ -1218,10 +1450,11 @@
                              int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   uint8_t *best_address;
 
   int tot_steps;
@@ -1237,7 +1470,6 @@
 
   uint8_t *check_here;
   int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1254,8 +1486,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
-                        (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
+                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1322,7 +1554,7 @@
                   xd->allow_high_precision_mv);
 }
 
-int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_diamond_search_sadx4(MACROBLOCK *x,
                              int_mv *ref_mv, int_mv *best_mv, int search_param,
                              int sad_per_bit, int *num00,
                              vp9_variance_fn_ptr_t *fn_ptr,
@@ -1329,10 +1561,11 @@
                              int *mvjcost, int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   uint8_t *best_address;
 
   int tot_steps;
@@ -1350,7 +1583,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1367,8 +1599,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
-                        (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
+                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1472,14 +1704,14 @@
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
+int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps,
                            int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
                            int_mv *ref_mv, int_mv *dst_mv) {
   int_mv temp_mv;
   int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost,
                                         x->mvcost, ref_mv);
@@ -1498,7 +1730,7 @@
     if (num00)
       num00--;
     else {
-      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
                                         ref_mv);
@@ -1519,7 +1751,7 @@
     int search_range = 8;
     int_mv best_mv;
     best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
+    thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, x->nmvjointcost, x->mvcost,
                                        ref_mv);
 
@@ -1531,18 +1763,19 @@
   return bestsme;
 }
 
-int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
                           int *mvcost[2],
-                          int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   int bestsad = INT_MAX;
   int r, c;
@@ -1549,7 +1782,6 @@
 
   uint8_t *check_here;
   int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1567,8 +1799,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1627,17 +1859,18 @@
     return INT_MAX;
 }
 
-int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
-                          int *mvcost[2], int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int *mvcost[2], int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1644,7 +1877,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1664,8 +1896,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1755,18 +1987,19 @@
     return INT_MAX;
 }
 
-int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1773,7 +2006,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1794,8 +2026,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1909,25 +2141,25 @@
   else
     return INT_MAX;
 }
-int vp9_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_refining_search_sad_c(MACROBLOCK *x,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
   int this_row_offset, this_col_offset;
 
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  uint8_t *what = (*(b->base_src) + b->src);
-  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
-                                      (ref_mv->as_mv.row * (d->pre_stride)) +
-                                      ref_mv->as_mv.col);
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
   uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1987,25 +2219,25 @@
     return INT_MAX;
 }
 
-int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_refining_search_sadx4(MACROBLOCK *x,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
   int this_row_offset, this_col_offset;
 
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  uint8_t *what = (*(b->base_src) + b->src);
-  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
-                                      (ref_mv->as_mv.row * (d->pre_stride)) +
-                                      ref_mv->as_mv.col);
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
   uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -2094,33 +2326,104 @@
     return INT_MAX;
 }
 
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
 
-#ifdef ENTROPY_STATS
-void print_mode_context(VP9_COMMON *pc) {
-  FILE *f = fopen("vp9_modecont.c", "a");
-  int i, j;
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] =");
-  fprintf(f, "{\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    fprintf(f, "  {/* %d */ ", j);
-    fprintf(f, "    ");
-    for (i = 0; i < 4; i++) {
-      int this_prob;
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
 
-      // context probs
-      this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0],
-                                  pc->fc.mv_ref_ct[j][i][1]);
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-      fprintf(f, "%5d, ", this_prob);
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
     }
-    fprintf(f, "  },\n");
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
   }
 
-  fprintf(f, "};\n");
-  fclose(f);
-}
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
-#endif/* END MV ref count ENTROPY_STATS stats code */
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -15,10 +15,6 @@
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_variance.h"
 
-#ifdef ENTROPY_STATS
-void print_mode_context(VP9_COMMON *pc);
-#endif
-
 // The maximum number of steps in a step search given the largest
 // allowed initial step
 #define MAX_MVSEARCH_STEPS 11
@@ -37,13 +33,13 @@
 
 // Runs sequence of diamond searches in smaller steps for RD
 struct VP9_COMP;
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
+int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            vp9_variance_fn_ptr_t *fn_ptr,
                            int_mv *ref_mv, int_mv *dst_mv);
 
-int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_hex_search(MACROBLOCK *x,
                    int_mv *ref_mv, int_mv *best_mv,
                    int search_param, int error_per_bit,
                    const vp9_variance_fn_ptr_t *vf,
@@ -51,7 +47,7 @@
                    int *mvjcost, int *mvcost[2],
                    int_mv *center_mv);
 
-typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv
+typedef int (fractional_mv_step_fp) (MACROBLOCK *x, int_mv
   *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
   int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse);
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
@@ -58,13 +54,13 @@
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
 
-typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
                                     int_mv *ref_mv, int sad_per_bit,
                                     int distance, vp9_variance_fn_ptr_t *fn_ptr,
                                     int *mvjcost, int *mvcost[2],
-                                    int_mv *center_mv);
+                                    int_mv *center_mv, int n);
 
-typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x,
                                         int_mv *ref_mv, int sad_per_bit,
                                         int distance,
                                         vp9_variance_fn_ptr_t *fn_ptr,
@@ -71,7 +67,7 @@
                                         int *mvjcost, int *mvcost[2],
                                         int_mv *center_mv);
 
-typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int_mv *ref_mv, int_mv *best_mv,
                                        int search_param, int sad_per_bit,
                                        int *num00,
@@ -79,5 +75,19 @@
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,32 +17,23 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
-  const vp9_tree_p KT = vp9_kf_bmode_tree;
+  const vp9_tree_p KT = vp9_intra_mode_tree;
   int i, j;
 
-  for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
-      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
-                      x->kf_bmode_prob[i][j], KT);
+  for (i = 0; i < VP9_INTRA_MODES; i++) {
+    for (j = 0; j < VP9_INTRA_MODES; j++) {
+      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j],
+                      x->kf_y_mode_prob[i][j], KT);
     }
   }
 
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
-                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
-
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
-  vp9_cost_tokens(c->mb.mbmode_cost[0],
-                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
-                  vp9_kf_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1],
+                  vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+                  x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+                  x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -32,7 +32,6 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_seg_common.h"
@@ -97,16 +96,11 @@
 FILE *keyfile;
 #endif
 
-#if 0
-extern int skip_true_count;
-extern int skip_false_count;
-#endif
 
-
 #ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES];
+extern int intra_mode_stats[VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES];
 #endif
 
 #ifdef NMV_STATS
@@ -113,13 +107,12 @@
 extern void init_nmvstats();
 extern void print_nmvstats();
 #endif
-
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-extern void init_nzcstats();
-extern void print_nzcstats();
+#ifdef MODE_STATS
+extern void init_tx_count_stats();
+extern void write_tx_count_stats();
+extern void init_switchable_interp_stats();
+extern void write_switchable_interp_stats();
 #endif
-#endif
 
 #ifdef SPEEDSTATS
 unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -128,22 +121,9 @@
 #if defined(SECTIONBITS_OUTPUT)
 extern unsigned __int64 Sectionbits[500];
 #endif
-#ifdef MODE_STATS
-extern int64_t Sectionbits[500];
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int i8x8_modes[VP9_I8X8_MODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
 
 extern void vp9_init_quantizer(VP9_COMP *cpi);
 
-static int base_skip_false_prob[QINDEX_RANGE][3];
-
 // Tables relating active max Q to active min Q
 static int kf_low_motion_minq[QINDEX_RANGE];
 static int kf_high_motion_minq[QINDEX_RANGE];
@@ -161,6 +141,11 @@
   const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
                                 maxq);
 
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0)
+    return 0;
+
   for (i = 0; i < QINDEX_RANGE; i++) {
     if (minqtarget <= vp9_convert_qindex_to_q(i))
       return i;
@@ -177,15 +162,16 @@
 
 
     kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000003,
-                                                 -0.000015,
-                                                 0.074,
+                                                 0.000001,
+                                                 -0.0004,
+                                                 0.15,
                                                  0.0);
     kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000004,
-                                                  -0.000125,
-                                                  0.14,
+                                                  0.000002,
+                                                  -0.0012,
+                                                  0.5,
                                                   0.0);
+
     gf_low_motion_minq[i] = calculate_minq_index(maxq,
                                                  0.0000015,
                                                  -0.0009,
@@ -214,52 +200,7 @@
     mb->mvsadcost = mb->nmvsadcost;
   }
 }
-static void init_base_skip_probs(void) {
-  int i;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    const double q = vp9_convert_qindex_to_q(i);
-
-    // Exponential decay caluclation of baseline skip prob with clamping
-    // Based on crude best fit of old table.
-    const int t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
-
-    base_skip_false_prob[i][1] = clip_prob(t);
-    base_skip_false_prob[i][2] = clip_prob(t * 3 / 4);
-    base_skip_false_prob[i][0] = clip_prob(t * 5 / 4);
-  }
-}
-
-static void update_base_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type != KEY_FRAME) {
-    vp9_update_skip_probs(cpi);
-
-    if (cpi->refresh_alt_ref_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[2] = cm->base_qindex;
-    } else if (cpi->refresh_golden_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[1] = cm->base_qindex;
-    } else {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[0] = cm->base_qindex;
-
-      // update the baseline table for the current q
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->base_skip_false_prob[cm->base_qindex][k] =
-          cm->mbskip_pred_probs[k];
-    }
-  }
-}
-
 void vp9_initialize_enc() {
   static int init_done = 0;
 
@@ -269,21 +210,17 @@
     vp9_init_quant_tables();
     vp9_init_me_luts();
     init_minq_luts();
-    init_base_skip_probs();
+    // init_base_skip_probs();
     init_done = 1;
   }
 }
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
 
 static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
   // Set up default state for MB feature flags
+  xd->segmentation_enabled = 0;
 
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
-
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
   vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
@@ -300,21 +237,7 @@
   set_default_lf_deltas(cpi);
 }
 
-
 static void dealloc_compressor_data(VP9_COMP *cpi) {
-  vpx_free(cpi->tplist);
-  cpi->tplist = NULL;
-
-  // Delete last frame MV storage buffers
-  vpx_free(cpi->lfmv);
-  cpi->lfmv = 0;
-
-  vpx_free(cpi->lf_ref_frame_sign_bias);
-  cpi->lf_ref_frame_sign_bias = 0;
-
-  vpx_free(cpi->lf_ref_frame);
-  cpi->lf_ref_frame = 0;
-
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
   cpi->segmentation_map = 0;
@@ -326,20 +249,16 @@
   vpx_free(cpi->active_map);
   cpi->active_map = 0;
 
-  vp9_de_alloc_frame_buffers(&cpi->common);
+  vp9_free_frame_buffers(&cpi->common);
 
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_free_frame_buffer(&cpi->last_frame_uf);
+  vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
   vpx_free(cpi->tok);
   cpi->tok = 0;
 
-  // Structure used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  cpi->gf_active_flags = 0;
-
   // Activity mask based per mb zbin adjustments
   vpx_free(cpi->mb_activity_map);
   cpi->mb_activity_map = 0;
@@ -348,15 +267,6 @@
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -394,7 +304,7 @@
   // Disable and clear down for KF
   if (cm->frame_type == KEY_FRAME) {
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
     cpi->static_mb_pct = 0;
@@ -407,7 +317,7 @@
   } else if (cpi->refresh_alt_ref_frame) {
     // If this is an alt ref frame
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
     cpi->static_mb_pct = 0;
@@ -437,9 +347,9 @@
       xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
+  } else if (xd->segmentation_enabled) {
+    // All other frames if segmentation has been enabled
+
     // First normal frame in a valid gf or alt ref group
     if (cpi->common.frames_since_golden == 0) {
       // Set up segment features for normal frames in an arf group
@@ -451,7 +361,6 @@
         qi_delta = compute_qdelta(cpi, cpi->avg_q,
                                   (cpi->avg_q * 1.125));
         vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
         vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
 
         vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
@@ -459,18 +368,17 @@
 
         // Segment coding disabled for compred testing
         if (high_q || (cpi->static_mb_pct == 100)) {
-          vp9_set_segref(xd, 1, ALTREF_FRAME);
+          vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
         }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
         vp9_disable_segmentation((VP9_PTR)cpi);
 
-        vpx_memset(cpi->segmentation_map, 0,
-                   (cm->mb_rows * cm->mb_cols));
+        vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
         xd->update_mb_segmentation_map = 0;
         xd->update_mb_segmentation_data = 0;
@@ -477,21 +385,20 @@
 
         vp9_clearall_segfeatures(xd);
       }
-    }
+    } else if (cpi->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
 
-    // Special case where we are coding over the top of a previous
-    // alt ref frame.
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
       // Enable ref frame features for segment 0 as well
       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
 
       // All mbs should use ALTREF_FRAME
-      vp9_clear_segref(xd, 0);
-      vp9_set_segref(xd, 0, ALTREF_FRAME);
-      vp9_clear_segref(xd, 1);
-      vp9_set_segref(xd, 1, ALTREF_FRAME);
+      vp9_clear_segdata(xd, 0, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(xd, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      vp9_clear_segdata(xd, 1, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
 
       // Skip all MBs if high Q (0,0 mv and skip coeffs)
       if (high_q) {
@@ -500,9 +407,9 @@
       }
       // Enable data udpate
       xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
+    } else {
+      // All other frames.
+
       // No updates.. leave things as they are.
       xd->update_mb_segmentation_map = 0;
       xd->update_mb_segmentation_data = 0;
@@ -510,6 +417,69 @@
   }
 }
 
+#ifdef ENTROPY_STATS
+void vp9_update_mode_context_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int i, j;
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      cm->fc.inter_mode_counts;
+  int64_t (*mv_ref_stats)[VP9_INTER_MODES - 1][2] = cpi->mv_ref_stats;
+  FILE *f;
+
+  // Read the past stats counters
+  f = fopen("mode_context.bin",  "rb");
+  if (!f) {
+    vpx_memset(cpi->mv_ref_stats, 0, sizeof(cpi->mv_ref_stats));
+  } else {
+    fread(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
+    fclose(f);
+  }
+
+  // Add in the values for this frame
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+    for (j = 0; j < VP9_INTER_MODES - 1; j++) {
+      mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0];
+      mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1];
+    }
+  }
+
+  // Write back the accumulated stats
+  f = fopen("mode_context.bin",  "wb");
+  fwrite(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
+  fclose(f);
+}
+
+void print_mode_context(VP9_COMP *cpi) {
+  FILE *f = fopen("vp9_modecont.c", "a");
+  int i, j;
+
+  fprintf(f, "#include \"vp9_entropy.h\"\n");
+  fprintf(
+      f,
+      "const int inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1] =");
+  fprintf(f, "{\n");
+  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
+    fprintf(f, "  {/* %d */ ", j);
+    fprintf(f, "    ");
+    for (i = 0; i < VP9_INTER_MODES - 1; i++) {
+      int this_prob;
+      int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
+      if (count)
+        this_prob = ((cpi->mv_ref_stats[j][i][0] * 256) + (count >> 1)) / count;
+      else
+        this_prob = 128;
+
+      // context probs
+      fprintf(f, "%5d, ", this_prob);
+    }
+    fprintf(f, "  },\n");
+  }
+
+  fprintf(f, "};\n");
+  fclose(f);
+}
+#endif  // ENTROPY_STATS
+
 // DEBUG: Print out the segment id of each MB in the current frame.
 static void print_seg_map(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
@@ -519,8 +489,8 @@
 
   fprintf(statsfile, "%10d\n", cm->current_video_frame);
 
-  for (row = 0; row < cpi->common.mb_rows; row++) {
-    for (col = 0; col < cpi->common.mb_cols; col++) {
+  for (row = 0; row < cpi->common.mi_rows; row++) {
+    for (col = 0; col < cpi->common.mi_cols; col++) {
       fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]);
       map_index++;
     }
@@ -537,14 +507,13 @@
   MODE_INFO *mi, *mi_ptr = cm->mi;
   uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
-  for (row = 0; row < cm->mb_rows; row++) {
+  for (row = 0; row < cm->mi_rows; row++) {
     mi = mi_ptr;
     cache = cache_ptr;
-    for (col = 0; col < cm->mb_cols; col++, mi++, cache++) {
+    for (col = 0; col < cm->mi_cols; col++, mi++, cache++)
       cache[0] = mi->mbmi.segment_id;
-    }
     mi_ptr += cm->mode_info_stride;
-    cache_ptr += cm->mb_cols;
+    cache_ptr += cm->mi_cols;
   }
 }
 
@@ -561,10 +530,8 @@
   cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
   cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
 
-  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
-  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
-  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
-  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+  cpi->mb.e_mbd.mode_lf_deltas[0] = 0;              // Zero
+  cpi->mb.e_mbd.mode_lf_deltas[1] = 0;               // New mv
 }
 
 static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
@@ -573,9 +540,8 @@
   int i;
 
   // Set baseline threshold values
-  for (i = 0; i < MAX_MODES; ++i) {
-    sf->thresh_mult[i] = (mode == 0) ? -500 : 0;
-  }
+  for (i = 0; i < MAX_MODES; ++i)
+    sf->thresh_mult[i] = mode == 0 ? -500 : 0;
 
   sf->thresh_mult[THR_ZEROMV   ] = 0;
   sf->thresh_mult[THR_ZEROG    ] = 0;
@@ -601,7 +567,6 @@
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
-  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -611,44 +576,40 @@
   sf->thresh_mult[THR_SPLITG   ] += speed_multiplier * 2500;
   sf->thresh_mult[THR_SPLITA   ] += speed_multiplier * 2500;
 
-  sf->thresh_mult[THR_COMP_ZEROLG   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_ZEROLA   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_ZEROGA   ] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEARLG   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARLA   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARGA   ] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEWLG    ] += speed_multiplier * 2000;
   sf->thresh_mult[THR_COMP_NEWLA    ] += speed_multiplier * 2000;
   sf->thresh_mult[THR_COMP_NEWGA    ] += speed_multiplier * 2000;
 
   sf->thresh_mult[THR_COMP_SPLITLA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
-  sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += speed_multiplier * 1500;
+  if (speed > 4) {
+    for (i = 0; i < MAX_MODES; ++i)
+      sf->thresh_mult[i] = INT_MAX;
 
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500;
+    sf->thresh_mult[THR_DC       ] = 0;
+    sf->thresh_mult[THR_TM       ] = 0;
+    sf->thresh_mult[THR_NEWMV    ] = 4000;
+    sf->thresh_mult[THR_NEWG     ] = 4000;
+    sf->thresh_mult[THR_NEWA     ] = 4000;
+    sf->thresh_mult[THR_NEARESTMV] = 0;
+    sf->thresh_mult[THR_NEARESTG ] = 0;
+    sf->thresh_mult[THR_NEARESTA ] = 0;
+    sf->thresh_mult[THR_NEARMV   ] = 2000;
+    sf->thresh_mult[THR_NEARG    ] = 2000;
+    sf->thresh_mult[THR_NEARA    ] = 2000;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+    sf->recode_loop = 0;
+  }
 
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += speed_multiplier * 1500;
-
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += speed_multiplier * 2000;
-#endif
-
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -656,12 +617,6 @@
     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
     sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
     sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
@@ -669,12 +624,6 @@
     sf->thresh_mult[THR_NEARG    ] = INT_MAX;
     sf->thresh_mult[THR_NEWG     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
@@ -682,22 +631,8 @@
     sf->thresh_mult[THR_NEARA    ] = INT_MAX;
     sf->thresh_mult[THR_NEWA     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;
-#endif
   }
 
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
-  }
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
       (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
@@ -719,7 +654,7 @@
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int mode = cpi->compressor_speed;
-  int speed = cpi->Speed;
+  int speed = cpi->speed;
   int i;
 
   // Only modes 0 and 1 supported for now in experimental code basae
@@ -736,24 +671,24 @@
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
-  sf->improved_dct = 1;
   sf->auto_filter = 1;
   sf->recode_loop = 1;
   sf->quarter_pixel_search = 1;
   sf->half_pixel_search = 1;
   sf->iterative_sub_pixel = 1;
-  sf->no_skip_block4x4_search = 1;
-  if (cpi->oxcf.lossless)
-    sf->optimize_coefficients = 0;
-  else
-    sf->optimize_coefficients = 1;
-
+  sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->first_step = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->static_segmentation = 1;
-  sf->splitmode_breakout = 0;
-  sf->mb16_breakout = 0;
+  sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
+  sf->adpative_rd_thresh = 0;
 
+#if CONFIG_MULTIPLE_ARF
+  // Switch segmentation off.
+  sf->static_segmentation = 0;
+#else
+  sf->static_segmentation = 0;
+#endif
+
   switch (mode) {
     case 0: // best quality mode
       sf->search_best_filter = SEARCH_BEST_FILTER;
@@ -760,52 +695,19 @@
       break;
 
     case 1:
-      sf->static_segmentation = 1;
-      sf->splitmode_breakout = 1;
-      sf->mb16_breakout = 0;
-
+#if CONFIG_MULTIPLE_ARF
+      // Switch segmentation off.
+      sf->static_segmentation = 0;
+#else
+      sf->static_segmentation = 0;
+#endif
+      sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+      sf->adpative_rd_thresh = 1;
       if (speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->optimize_coefficients = 0;
-        sf->no_skip_block4x4_search = 0;
-
         sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
       }
-
-      if (speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-      }
-
-      if (speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        sf->improved_dct = 0;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-      }
-
       break;
 
   }; /* switch */
@@ -817,7 +719,6 @@
   // so make sure they are always turned off.
   if (cpi->pass == 1) {
     sf->optimize_coefficients = 0;
-    sf->improved_dct = 0;
   }
 
   cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;
@@ -830,9 +731,6 @@
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
 
   vp9_init_quantizer(cpi);
 
@@ -844,26 +742,27 @@
     cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
   }
 
-  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
-    cpi->mb.optimize = 1;
-  else
-    cpi->mb.optimize = 0;
+  cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
 
 #ifdef SPEEDSTATS
-  frames_at_speed[cpi->Speed]++;
+  frames_at_speed[cpi->speed]++;
 #endif
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
   cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+                                      cm->subsampling_x, cm->subsampling_y,
                                       cpi->oxcf.lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  cpi->oxcf.width, cpi->oxcf.height,
-                                  VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               cpi->oxcf.width, cpi->oxcf.height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -871,8 +770,8 @@
 static int alloc_partition_data(VP9_COMP *cpi) {
   vpx_free(cpi->mb.pip);
 
-  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
-                           (cpi->common.mb_rows + 1),
+  cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) *
+                           (cpi->common.mi_rows + 64 / MI_SIZE),
                            sizeof(PARTITION_INFO));
   if (!cpi->mb.pip)
     return 1;
@@ -893,13 +792,17 @@
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate partition data");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -906,7 +809,7 @@
   vpx_free(cpi->tok);
 
   {
-    unsigned int tokens = cm->mb_rows * cm->mb_cols * (24 * 16 + 1);
+    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
 
     CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
@@ -916,13 +819,6 @@
   cpi->gf_bad_count = 0;
   cpi->gf_update_recommended = 0;
 
-
-  // Structures used to minitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  CHECK_MEM_ERROR(cpi->gf_active_flags,
-                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
   vpx_free(cpi->mb_activity_map);
   CHECK_MEM_ERROR(cpi->mb_activity_map,
                   vpx_calloc(sizeof(unsigned int),
@@ -932,28 +828,6 @@
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
-
-  vpx_free(cpi->tplist);
-
-  CHECK_MEM_ERROR(cpi->tplist,
-                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
 }
 
 
@@ -960,30 +834,20 @@
 static void update_frame_size(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
-  /* our internal buffers are always multiples of 16 */
-  int aligned_width = (cm->width + 15) & ~15;
-  int aligned_height = (cm->height + 15) & ~15;
+  vp9_update_frame_size(cm);
 
-  cm->mb_rows = aligned_height >> 4;
-  cm->mb_cols = aligned_width >> 4;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-  cm->mode_info_stride = cm->mb_cols + 1;
-  memset(cm->mip, 0,
-        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vp9_update_mode_info_border(cm, cm->mip);
-
-  cm->mi = cm->mip + cm->mode_info_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  vp9_update_mode_info_in_image(cm, cm->mi);
-
-  /* Update size of buffers local to this frame */
-  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  // Update size of buffers local to this frame
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
-  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -1026,7 +890,7 @@
   return 63;
 };
 void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
-  if (framerate < .1)
+  if (framerate < 0.1)
     framerate = 30;
 
   cpi->oxcf.frame_rate             = framerate;
@@ -1035,9 +899,9 @@
   cpi->av_per_frame_bandwidth        = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
   cpi->min_frame_bandwidth          = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
-  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
-    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
 
+  cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
   // Set Maximum gf/arf interval
   cpi->max_gf_interval = 16;
 
@@ -1074,10 +938,10 @@
 
   vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles);
   max_log2_tiles += min_log2_tiles;
-  if (cm->log2_tile_columns < min_log2_tiles)
-    cm->log2_tile_columns = min_log2_tiles;
-  else if (cm->log2_tile_columns > max_log2_tiles)
-    cm->log2_tile_columns = max_log2_tiles;
+
+  cm->log2_tile_columns = clamp(cm->log2_tile_columns,
+                                min_log2_tiles, max_log2_tiles);
+
   cm->tile_columns = 1 << cm->log2_tile_columns;
   cm->tile_rows = 1 << cm->log2_tile_rows;
 }
@@ -1085,16 +949,18 @@
 static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
   VP9_COMMON *const cm = &cpi->common;
+  int i;
 
   cpi->oxcf = *oxcf;
-
   cpi->goldfreq = 7;
 
   cm->version = oxcf->version;
-  vp9_setup_version(cm);
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
+  cm->subsampling_x = 0;
+  cm->subsampling_y = 0;
+  vp9_alloc_compressor_data(cpi);
 
   // change includes all joint functionality
   vp9_change_config(ptr, oxcf);
@@ -1124,12 +990,9 @@
 
   set_tile_limits(cpi);
 
-  {
-    int i;
-    cpi->fixed_divide[0] = 0;
-    for (i = 1; i < 512; i++)
-      cpi->fixed_divide[i] = 0x80000 / i;
-  }
+  cpi->fixed_divide[0] = 0;
+  for (i = 1; i < 512; i++)
+    cpi->fixed_divide[i] = 0x80000 / i;
 }
 
 
@@ -1142,7 +1005,6 @@
 
   if (cm->version != oxcf->version) {
     cm->version = oxcf->version;
-    vp9_setup_version(cm);
   }
 
   cpi->oxcf = *oxcf;
@@ -1157,13 +1019,7 @@
     case MODE_SECONDPASS:
       cpi->pass = 2;
       cpi->compressor_speed = 1;
-
-      if (cpi->oxcf.cpu_used < -5) {
-        cpi->oxcf.cpu_used = -5;
-      }
-
-      if (cpi->oxcf.cpu_used > 5)
-        cpi->oxcf.cpu_used = 5;
+      cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
       break;
 
     case MODE_SECONDPASS_BEST:
@@ -1178,11 +1034,11 @@
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1193,7 +1049,8 @@
   // cpi->use_last_frame_only = 0;
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
-  cm->refresh_entropy_probs = 1;
+  cm->refresh_frame_context = 1;
+  cm->reset_frame_context = 0;
 
   setup_features(cpi);
   cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
@@ -1207,8 +1064,7 @@
   }
 
   // At the moment the first order values may not be > MAXQ
-  if (cpi->oxcf.fixed_q > MAXQ)
-    cpi->oxcf.fixed_q = MAXQ;
+  cpi->oxcf.fixed_q = MIN(cpi->oxcf.fixed_q, MAXQ);
 
   // local file playback mode == really big buffer
   if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
@@ -1244,29 +1100,19 @@
   cpi->best_quality = cpi->oxcf.best_allowed_q;
 
   // active values should only be modified if out of new range
-  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
-  }
-  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
-  }
+  cpi->active_worst_quality = clamp(cpi->active_worst_quality,
+                                    cpi->oxcf.best_allowed_q,
+                                    cpi->oxcf.worst_allowed_q);
 
-  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+  cpi->active_best_quality = clamp(cpi->active_best_quality,
+                                   cpi->oxcf.best_allowed_q,
+                                   cpi->oxcf.worst_allowed_q);
 
+  cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+
   cpi->cq_target_quality = cpi->oxcf.cq_level;
 
-  if (!cm->use_bilinear_mc_filter)
-    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-  else
-    cm->mcomp_filter_type = BILINEAR;
+  cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
 
   cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
 
@@ -1274,22 +1120,17 @@
   cm->display_height = cpi->oxcf.height;
 
   // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-  if (cpi->oxcf.Sharpness > 7)
-    cpi->oxcf.Sharpness = 7;
+  cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness);
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  // Increasing the size of the frame beyond the first seen frame, or some
-  // otherwise signalled maximum size, is not supported.
-  // TODO(jkoleszar): exit gracefully.
-  if (!cpi->initial_width) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+  if (cpi->initial_width) {
+    // Increasing the size of the frame beyond the first seen frame, or some
+    // otherwise signalled maximum size, is not supported.
+    // TODO(jkoleszar): exit gracefully.
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
   }
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
   update_frame_size(cpi);
 
   if (cpi->oxcf.fixed_q >= 0) {
@@ -1298,18 +1139,22 @@
     cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
   }
 
-  cpi->Speed = cpi->oxcf.cpu_used;
+  cpi->speed = cpi->oxcf.cpu_used;
 
-  // force to allowlag to 0 if lag_in_frames is 0;
   if (cpi->oxcf.lag_in_frames == 0) {
+    // force to allowlag to 0 if lag_in_frames is 0;
     cpi->oxcf.allow_lag = 0;
-  }
-  // Limit on lag buffers as these are not currently dynamically allocated
-  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+  } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) {
+     // Limit on lag buffers as these are not currently dynamically allocated
     cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+  }
 
   // YX Temp
+#if CONFIG_MULTIPLE_ARF
+  vp9_zero(cpi->alt_ref_source);
+#else
   cpi->alt_ref_source = NULL;
+#endif
   cpi->is_src_frame_alt_ref = 0;
 
 #if 0
@@ -1396,30 +1241,13 @@
 
   init_config((VP9_PTR)cpi, oxcf);
 
-  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
   cpi->common.current_video_frame   = 0;
   cpi->kf_overspend_bits            = 0;
   cpi->kf_bitrate_adjustment        = 0;
-  cpi->frames_till_gf_update_due      = 0;
+  cpi->frames_till_gf_update_due    = 0;
   cpi->gf_overspend_bits            = 0;
-  cpi->non_gf_bitrate_adjustment     = 0;
-  cm->prob_last_coded               = 128;
-  cm->prob_gf_coded                 = 128;
-  cm->prob_intra_coded              = 63;
-  cm->sb32_coded                    = 200;
-  cm->sb64_coded                    = 200;
-  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-    cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX_SB - 1; i++)
-    cm->prob_tx[i]               = 128;
+  cpi->non_gf_bitrate_adjustment    = 0;
 
-  // Prime the recent reference frame useage counters.
-  // Hereafter they will be maintained as a sort of moving average
-  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
-  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
-  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-
   // Set reference frame sign bias for ALTREF frame to 1 (for now)
   cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
 
@@ -1429,22 +1257,18 @@
   cpi->alt_is_last  = 0;
   cpi->gold_is_alt  = 0;
 
-  // allocate memory for storing last frame's MVs for MV prediction.
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-
   // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+  CHECK_MEM_ERROR(cpi->segmentation_map,
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   // And a copy in common for temporal coding
   CHECK_MEM_ERROR(cm->last_frame_seg_map,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
   vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
@@ -1462,24 +1286,14 @@
   if (cpi->pass != 1)
     init_context_counters();
 #endif
-#ifdef MODE_STATS
-  vp9_zero(y_modes);
-  vp9_zero(i8x8_modes);
-  vp9_zero(uv_modes);
-  vp9_zero(uv_modes_y);
-  vp9_zero(b_modes);
-  vp9_zero(inter_y_modes);
-  vp9_zero(inter_uv_modes);
-  vp9_zero(inter_b_modes);
-#endif
+
 #ifdef NMV_STATS
   init_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-  init_nzcstats();
+#ifdef MODE_STATS
+  init_tx_count_stats();
+  init_switchable_interp_stats();
 #endif
-#endif
 
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
@@ -1486,13 +1300,26 @@
 
   cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
   cpi->key_frame_frequency = cpi->oxcf.key_freq;
-  cpi->this_key_frame_forced = FALSE;
-  cpi->next_key_frame_forced = FALSE;
+  cpi->this_key_frame_forced = 0;
+  cpi->next_key_frame_forced = 0;
 
-  cpi->source_alt_ref_pending = FALSE;
-  cpi->source_alt_ref_active = FALSE;
+  cpi->source_alt_ref_pending = 0;
+  cpi->source_alt_ref_active = 0;
   cpi->refresh_alt_ref_frame = 0;
 
+#if CONFIG_MULTIPLE_ARF
+  // Turn multiple ARF usage on/off. This is a quick hack for the initial test
+  // version. It should eventually be set via the codec API.
+  cpi->multi_arf_enabled = 1;
+
+  if (cpi->multi_arf_enabled) {
+    cpi->sequence_number = 0;
+    cpi->frame_coding_order_period = 0;
+    vp9_zero(cpi->frame_coding_order);
+    vp9_zero(cpi->arf_buffer_idx);
+  }
+#endif
+
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
@@ -1514,6 +1341,8 @@
     cpi->tot_recode_hits = 0;
     cpi->summed_quality = 0;
     cpi->summed_weights = 0;
+    cpi->summedp_quality = 0;
+    cpi->summedp_weights = 0;
   }
 
   if (cpi->b_calculate_ssimg) {
@@ -1555,9 +1384,8 @@
   cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
-  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+  for (i = 0; i < KEY_FRAME_CONTEXT; i++)
     cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
-  }
 
 #ifdef OUTPUT_YUV_SRC
   yuv_file = fopen("bd.yuv", "ab");
@@ -1589,14 +1417,14 @@
   vp9_set_speed_features(cpi);
 
   // Set starting values of RD threshold multipliers (128 = *1)
-  for (i = 0; i < MAX_MODES; i++) {
+  for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
-  }
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1604,33 +1432,69 @@
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
+  BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad32x16x4d)
 
+  BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad16x32x4d)
+
+  BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad64x32x4d)
+
+  BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad32x64x4d)
+
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
+      NULL, NULL, vp9_sad8x4x8,
+      vp9_sad8x4x4d)
+
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
+      NULL, NULL, vp9_sad4x8x8,
+      vp9_sad4x8x4d)
+
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
@@ -1651,13 +1515,6 @@
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
-#endif
 
   return (VP9_PTR) cpi;
 }
@@ -1678,7 +1535,7 @@
     if (cpi->pass != 1) {
       print_context_counters();
       print_tree_update_probs();
-      print_mode_context(&cpi->common);
+      print_mode_context(cpi);
     }
 #endif
 #ifdef NMV_STATS
@@ -1685,12 +1542,12 @@
     if (cpi->pass != 1)
       print_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-    if (cpi->pass != 1)
-      print_nzcstats();
+#ifdef MODE_STATS
+    if (cpi->pass != 1) {
+      write_tx_count_stats();
+      write_switchable_interp_stats();
+    }
 #endif
-#endif
 
 #if CONFIG_INTERNAL_STATS
 
@@ -1703,24 +1560,29 @@
                              - cpi->first_time_stamp_ever) / 10000000.000;
       double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
       double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
-#if defined(MODE_STATS)
-      print_mode_contexts(&cpi->common);
-#endif
+
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *lst_yv12 =
             &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
-        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
+        double samples = 3.0 / 2 * cpi->count *
+                         lst_yv12->y_width * lst_yv12->y_height;
         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+        double total_ssim = 100 * pow(cpi->summed_quality /
+                                      cpi->summed_weights, 8.0);
+        double total_ssimp = 100 * pow(cpi->summedp_quality /
+                                       cpi->summedp_weights, 8.0);
 
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+                dr, cpi->total / cpi->count, total_psnr,
+                cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
                 total_encode_time);
-//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-//                        total_encode_time, cpi->tot_recode_hits);
+//         fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
+//                 dr, cpi->total / cpi->count, total_psnr,
+//                 cpi->totalp / cpi->count, total_psnr2, total_ssim,
+//                 total_encode_time, cpi->tot_recode_hits);
       }
 
       if (cpi->b_calculate_ssimg) {
@@ -1738,88 +1600,6 @@
 
 #endif
 
-
-#ifdef MODE_STATS
-    {
-      extern int count_mb_seg[4];
-      char modes_stats_file[250];
-      FILE *f;
-      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
-      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
-      f = fopen(modes_stats_file, "w");
-      fprintf(f, "intra_mode in Intra Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "I8: ");
-        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i, j;
-        fprintf(f, "KeyFrame Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i, j;
-        fprintf(f, "Inter Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i;
-
-        fprintf(f, "B: ");
-        for (i = 0; i < VP9_NKF_BINTRAMODES; i++)
-          fprintf(f, "%8d, ", b_modes[i]);
-
-        fprintf(f, "\n");
-
-      }
-
-      fprintf(f, "Modes in Inter Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "B: ");
-        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
-        fprintf(f, "\n");
-      }
-      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
-      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
-      fclose(f);
-    }
-#endif
-
 #ifdef ENTROPY_STATS
     {
       int i, j, k;
@@ -1827,18 +1607,18 @@
 
       fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
       fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES]"
-                     "[VP9_KF_BINTRAMODES] =\n{\n");
+      fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]"
+                     "[VP9_INTRA_MODES] =\n{\n");
 
-      for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
+      for (i = 0; i < VP9_INTRA_MODES; i++) {
 
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
-        for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
+        for (j = 0; j < VP9_INTRA_MODES; j++) {
 
           fprintf(fmode, "        {");
 
-          for (k = 0; k < VP9_KF_BINTRAMODES; k++) {
+          for (k = 0; k < VP9_INTRA_MODES; k++) {
             if (!intra_mode_stats[i][j][k])
               fprintf(fmode, " %5d, ", 1);
             else
@@ -1988,8 +1768,8 @@
   pkt.data.psnr.samples[0] = width * height;
   pkt.data.psnr.samples[1] = width * height;
 
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
+  width = orig->uv_width;
+  height = orig->uv_height;
 
   sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
                          recon->u_buffer, recon->uv_stride,
@@ -2098,10 +1878,7 @@
   return 0;
 }
 int vp9_update_entropy(VP9_PTR comp, int update) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  VP9_COMMON *cm = &cpi->common;
-  cm->refresh_entropy_probs = update;
-
+  ((VP9_COMP *)comp)->common.refresh_frame_context = update;
   return 0;
 }
 
@@ -2146,7 +1923,7 @@
   } while (--h);
 
   src = s->u_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1,  yuv_rec_file);
@@ -2154,12 +1931,24 @@
   } while (--h);
 
   src = s->v_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1, yuv_rec_file);
     src += s->uv_stride;
   } while (--h);
+
+#if CONFIG_ALPHA
+  if (s->alpha_buffer) {
+    src = s->alpha_buffer;
+    h = s->alpha_height;
+    do {
+      fwrite(src, s->alpha_width, 1,  yuv_rec_file);
+      src += s->alpha_stride;
+    } while (--h);
+  }
+#endif
+
   fflush(yuv_rec_file);
 }
 #endif
@@ -2170,56 +1959,35 @@
   const int in_h = src_fb->y_crop_height;
   const int out_w = dst_fb->y_crop_width;
   const int out_h = dst_fb->y_crop_height;
-  int x, y;
+  int x, y, i;
 
+  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
+                      src_fb->alpha_buffer};
+  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
+                        src_fb->alpha_stride};
+
+  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
+                      dst_fb->alpha_buffer};
+  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
+                        dst_fb->alpha_stride};
+
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
-      int x_q4 = x * 16 * in_w / out_w;
-      int y_q4 = y * 16 * in_h / out_h;
-      uint8_t *src, *dst;
-      int src_stride, dst_stride;
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = i == 0 ? 1 : 2;
+        const int x_q4 = x * (16 / factor) * in_w / out_w;
+        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+                                 x / factor * in_w / out_w;
+        uint8_t *dst = dsts[i] + y * dst_stride + x;
 
-
-      src = src_fb->y_buffer +
-          y * in_h / out_h * src_fb->y_stride +
-          x * in_w / out_w;
-      dst = dst_fb->y_buffer +
-          y * dst_fb->y_stride +
-          x;
-      src_stride = src_fb->y_stride;
-      dst_stride = dst_fb->y_stride;
-
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    16, 16);
-
-      x_q4 >>= 1;
-      y_q4 >>= 1;
-      src_stride = src_fb->uv_stride;
-      dst_stride = dst_fb->uv_stride;
-
-      src = src_fb->u_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->u_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
-
-      src = src_fb->v_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->v_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
+        vp9_convolve8(src, src_stride, dst, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                      16 / factor, 16 / factor);
+      }
     }
   }
 
@@ -2228,62 +1996,35 @@
 
 
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update data structure that monitors level of reference to last GF
-  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
   // this frame refreshes means next frames don't unless specified by user
   cpi->common.frames_since_golden = 0;
 
-  // Clear the alternate reference update pending flag.
-  cpi->source_alt_ref_pending = FALSE;
+#if CONFIG_MULTIPLE_ARF
+  if (!cpi->multi_arf_enabled)
+#endif
+    // Clear the alternate reference update pending flag.
+    cpi->source_alt_ref_pending = 0;
 
-  // Set the alternate refernce frame active flag
-  cpi->source_alt_ref_active = TRUE;
-
-
+  // Set the alternate reference frame active flag
+  cpi->source_alt_ref_active = 1;
 }
 static void update_golden_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
-    // Update data structure that monitors level of reference to last GF
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
     // this frame refreshes means next frames don't unless specified by user
     cpi->refresh_golden_frame = 0;
     cpi->common.frames_since_golden = 0;
 
-    // if ( cm->frame_type == KEY_FRAME )
-    // {
-    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
-    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
-    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-    // }
-    // else
-    // {
-    //  // Carry a potrtion of count over to begining of next gf sequence
-    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
-    // }
-
     // ******** Fixed Q test code only ************
     // If we are going to use the ALT reference for the next group of frames set a flag to say so.
     if (cpi->oxcf.fixed_q >= 0 &&
         cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
-      cpi->source_alt_ref_pending = TRUE;
+      cpi->source_alt_ref_pending = 1;
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
     }
 
     if (!cpi->source_alt_ref_pending)
-      cpi->source_alt_ref_active = FALSE;
+      cpi->source_alt_ref_active = 0;
 
     // Decrement count down till next gf
     if (cpi->frames_till_gf_update_due > 0)
@@ -2298,13 +2039,6 @@
       cpi->common.frames_till_alt_ref_frame--;
 
     cpi->common.frames_since_golden++;
-
-    if (cpi->common.frames_since_golden > 1) {
-      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
-    }
   }
 }
 
@@ -2384,7 +2118,8 @@
       int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
       h = (h < 0 ? -h : h);
       v = (v < 0 ? -v : v);
-      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
+      if (h > EDGE_THRESH || v > EDGE_THRESH)
+        num_edge_pels++;
       curr++;
       prev++;
       next++;
@@ -2393,7 +2128,7 @@
     prev += frame->y_stride - frame->y_width + 2;
     next += frame->y_stride - frame->y_width + 2;
   }
-  return (double)num_edge_pels / (double)num_pels;
+  return (double)num_edge_pels / num_pels;
 }
 
 // Function to test for conditions that indicate we should loop
@@ -2401,11 +2136,11 @@
 static int recode_loop_test(VP9_COMP *cpi,
                             int high_limit, int low_limit,
                             int q, int maxq, int minq) {
-  int force_recode = FALSE;
+  int force_recode = 0;
   VP9_COMMON *cm = &cpi->common;
 
   // Is frame recode allowed at all
-  // Yes if either recode mode 1 is selected or mode two is selcted
+  // Yes if either recode mode 1 is selected or mode two is selected
   // and the frame is a key frame. golden frame or alt_ref_frame
   if ((cpi->sf.recode_loop == 1) ||
       ((cpi->sf.recode_loop == 2) &&
@@ -2415,21 +2150,19 @@
     // General over and under shoot tests
     if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
         ((cpi->projected_frame_size < low_limit) && (q > minq))) {
-      force_recode = TRUE;
+      force_recode = 1;
     }
     // Special Constrained quality tests
     else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
       // Undershoot and below auto cq level
-      if ((q > cpi->cq_target_quality) &&
-          (cpi->projected_frame_size <
-           ((cpi->this_frame_target * 7) >> 3))) {
-        force_recode = TRUE;
-      }
-      // Severe undershoot and between auto and user cq level
-      else if ((q > cpi->oxcf.cq_level) &&
-               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
-               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
-        force_recode = TRUE;
+      if (q > cpi->cq_target_quality &&
+          cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
+        force_recode = 1;
+      } else if (q > cpi->oxcf.cq_level &&
+                 cpi->projected_frame_size < cpi->min_frame_bandwidth &&
+                 cpi->active_best_quality > cpi->oxcf.cq_level) {
+        // Severe undershoot and between auto and user cq level
+        force_recode = 1;
         cpi->active_best_quality = cpi->oxcf.cq_level;
       }
     }
@@ -2448,13 +2181,19 @@
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(cm->fb_idx_ref_cnt,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-  } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+  }
+#if CONFIG_MULTIPLE_ARF
+  else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
+      !cpi->refresh_alt_ref_frame) {
+#else
+  else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+#endif
     /* Preserve the previously existing golden frame and update the frame in
      * the alt ref slot instead. This is highly specific to the current use of
      * alt-ref as a forward reference, and this needs to be generalized as
      * other uses are implemented (like RTC/temporal scaling)
      *
-     * The update to the buffer in the alt ref slot was signalled in
+     * The update to the buffer in the alt ref slot was signaled in
      * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated
      * as the golden frame next time.
      */
@@ -2466,10 +2205,16 @@
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
-  } else { /* For non key/golden frames */
+  }  else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
+      int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+      if (cpi->multi_arf_enabled) {
+        arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1];
+      }
+#endif
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
     }
 
     if (cpi->refresh_golden_frame) {
@@ -2485,7 +2230,7 @@
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  if (cm->no_lpf || cpi->mb.e_mbd.lossless) {
+  if (cpi->mb.e_mbd.lossless) {
     cm->filter_level = 0;
   } else {
     struct vpx_usec_timer timer;
@@ -2493,11 +2238,9 @@
     vp9_clear_system_state();
 
     vpx_usec_timer_start(&timer);
-    if (cpi->sf.auto_filter == 0)
-      vp9_pick_filter_level_fast(cpi->Source, cpi);
-    else
-      vp9_pick_filter_level(cpi->Source, cpi);
 
+    vp9_pick_filter_level(cpi->Source, cpi);
+
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
   }
@@ -2504,11 +2247,11 @@
 
   if (cm->filter_level > 0) {
     vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0,
-                          cm->dering_enabled);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);
   }
 
-  vp8_yv12_extend_frame_borders(cm->frame_to_show);
+  vp9_extend_frame_borders(cm->frame_to_show,
+                           cm->subsampling_x, cm->subsampling_y);
 
 }
 
@@ -2551,20 +2294,6 @@
   }
 }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-static void select_interintra_mode(VP9_COMP *cpi) {
-  static const double threshold = 0.01;
-  VP9_COMMON *cm = &cpi->common;
-  // FIXME(debargha): Make this RD based
-  int sum = cpi->interintra_select_count[1] + cpi->interintra_select_count[0];
-  if (sum) {
-    double fraction = (double) cpi->interintra_select_count[1] / sum;
-    // printf("fraction: %f\n", fraction);
-    cm->use_interintra = (fraction > threshold);
-  }
-}
-#endif
-
 static void scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int i;
@@ -2576,9 +2305,10 @@
         ref->y_crop_height != cm->height) {
       int new_fb = get_free_fb(cm);
 
-      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                                    cm->width, cm->height,
-                                    VP9BORDERINPIXELS);
+      vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
     } else {
@@ -2592,9 +2322,8 @@
   VP9_COMMON *cm = &cpi->common;
   int i;
 
-  for (i = 0; i < 3; i++) {
+  for (i = 0; i < 3; i++)
     cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
-  }
 }
 
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
@@ -2603,12 +2332,12 @@
                                       unsigned int *frame_flags) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int Q;
+  TX_SIZE t;
+  int q;
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
 
-  int Loop = FALSE;
+  int loop = 0;
   int loop_count;
 
   int q_low;
@@ -2616,10 +2345,10 @@
 
   int top_index;
   int bottom_index;
-  int active_worst_qchanged = FALSE;
+  int active_worst_qchanged = 0;
 
-  int overshoot_seen = FALSE;
-  int undershoot_seen = FALSE;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
 
   SPEED_FEATURES *sf = &cpi->sf;
 #if RESET_FOREACH_FILTER
@@ -2634,11 +2363,7 @@
 
   /* list of filters to search over */
   int mcomp_filters_to_search[] = {
-#if CONFIG_ENABLE_6TAP
-      EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
-#else
-      EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
-#endif
+    EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
   };
   int mcomp_filters = sizeof(mcomp_filters_to_search) /
       sizeof(*mcomp_filters_to_search);
@@ -2646,8 +2371,8 @@
   int64_t mcomp_filter_cost[4];
 
   /* Scale the source buffer, if required */
-  if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width ||
-      cm->mb_rows * 16 != cpi->un_scaled_source->y_height) {
+  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
     scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
     cpi->Source = &cpi->scaled_source;
   } else {
@@ -2663,7 +2388,8 @@
   // For an alt ref frame in 2 pass we skip the call to the second
   // pass function that sets the target bandwidth so must set it here
   if (cpi->refresh_alt_ref_frame) {
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                           // Per frame bit target for the alt ref frame
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
     // per second target bitrate
     cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
                                   cpi->output_frame_rate);
@@ -2678,17 +2404,14 @@
   cpi->zbin_mode_boost = 0;
 
   // if (cpi->oxcf.lossless)
-    cpi->zbin_mode_boost_enabled = FALSE;
+    cpi->zbin_mode_boost_enabled = 0;
   // else
-  //   cpi->zbin_mode_boost_enabled = TRUE;
+  //   cpi->zbin_mode_boost_enabled = 1;
 
   // Current default encoder behaviour for the altref sign bias
-  if (cpi->source_alt_ref_active)
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-  else
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
 
-  // Check to see if a key frame is signalled
+  // Check to see if a key frame is signaled
   // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
   if ((cm->current_video_frame == 0) ||
       (cm->frame_flags & FRAMEFLAGS_KEY) ||
@@ -2715,12 +2438,11 @@
     }
 
     // The alternate reference frame cannot be active for a key frame
-    cpi->source_alt_ref_active = FALSE;
+    cpi->source_alt_ref_active = 0;
 
     // Reset the RD threshold multipliers to default of * 1 (128)
-    for (i = 0; i < MAX_MODES; i++) {
+    for (i = 0; i < MAX_MODES; i++)
       cpi->rd_thresh_mult[i] = 128;
-    }
 
     cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
     cm->frame_parallel_decoding_mode =
@@ -2727,13 +2449,15 @@
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
     if (cm->error_resilient_mode) {
       cm->frame_parallel_decoding_mode = 1;
-      cm->refresh_entropy_probs = 0;
+      cm->reset_frame_context = 0;
+      cm->refresh_frame_context = 0;
     }
   }
 
-  // Configure use of segmentation for enhanced coding of static regions.
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
   // Only allowed for now in second pass of two pass (as requires lagged coding)
-  // and if the relevent speed feature flag is set.
+  // and if the relevant speed feature flag is set.
   if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
     configure_static_seg_features(cpi);
   }
@@ -2744,31 +2468,10 @@
   vp9_clear_system_state();
 
   // Set an active best quality and if necessary active worst quality
-  Q = cpi->active_worst_quality;
+  q = cpi->active_worst_quality;
 
   if (cm->frame_type == KEY_FRAME) {
-    int high = 2000;
-    int low = 400;
-
-    if (cpi->kf_boost > high)
-      cpi->active_best_quality = kf_low_motion_minq[Q];
-    else if (cpi->kf_boost < low)
-      cpi->active_best_quality = kf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->kf_boost;
-      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Make an adjustment based on the %s static
-    // The main impact of this is at lower Q to prevent overly large key
-    // frames unless a lot of the image is static.
-    if (cpi->kf_zeromotion_pct < 64)
-      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
-
+#if !CONFIG_MULTIPLE_ARF
     // Special case for key frames forced because we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping
@@ -2775,14 +2478,54 @@
     if (cpi->this_key_frame_forced) {
       int delta_qindex;
       int qindex = cpi->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
 
-      delta_qindex = compute_qdelta(cpi, qindex,
-                                    (qindex * 0.75));
+      delta_qindex = compute_qdelta(cpi, last_boosted_q,
+                                    (last_boosted_q * 0.75));
 
-      cpi->active_best_quality = qindex + delta_qindex;
-      if (cpi->active_best_quality < cpi->best_quality)
-        cpi->active_best_quality = cpi->best_quality;
+      cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality);
+    } else {
+      int high = 5000;
+      int low = 400;
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      // Baseline value derived from cpi->active_worst_quality and kf boost
+      if (cpi->kf_boost > high) {
+        cpi->active_best_quality = kf_low_motion_minq[q];
+      } else if (cpi->kf_boost < low) {
+        cpi->active_best_quality = kf_high_motion_minq[q];
+      } else {
+        const int gap = high - low;
+        const int offset = high - cpi->kf_boost;
+        const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q];
+        const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+        cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
+      }
+
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
+      cpi->active_best_quality +=
+        compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
     }
+#else
+    double current_q;
+
+    // Force the KF quantizer to be 30% of the active_worst_quality.
+    current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
+    cpi->active_best_quality = cpi->active_worst_quality
+        + compute_qdelta(cpi, current_q, current_q * 0.3);
+#endif
   } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     int high = 2000;
     int low = 400;
@@ -2790,47 +2533,45 @@
     // Use the lower of cpi->active_worst_quality and recent
     // average Q as basis for GF/ARF Q limit unless last frame was
     // a key frame.
-    if ((cpi->frames_since_key > 1) &&
-        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
-      Q = cpi->avg_frame_qindex;
+    if (cpi->frames_since_key > 1 &&
+        cpi->avg_frame_qindex < cpi->active_worst_quality) {
+      q = cpi->avg_frame_qindex;
     }
 
     // For constrained quality dont allow Q less than the cq level
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (Q < cpi->cq_target_quality)) {
-      Q = cpi->cq_target_quality;
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+        q < cpi->cq_target_quality) {
+      q = cpi->cq_target_quality;
     }
 
-    if (cpi->gfu_boost > high)
-      cpi->active_best_quality = gf_low_motion_minq[Q];
-    else if (cpi->gfu_boost < low)
-      cpi->active_best_quality = gf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->gfu_boost;
-      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+    if (cpi->gfu_boost > high) {
+      cpi->active_best_quality = gf_low_motion_minq[q];
+    } else if (cpi->gfu_boost < low) {
+      cpi->active_best_quality = gf_high_motion_minq[q];
+    } else {
+      const int gap = high - low;
+      const int offset = high - cpi->gfu_boost;
+      const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q];
+      const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
 
-      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
+      cpi->active_best_quality = gf_low_motion_minq[q] + adjustment;
     }
 
     // Constrained quality use slightly lower active best.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      cpi->active_best_quality =
-        cpi->active_best_quality * 15 / 16;
-    }
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+      cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
   } else {
 #ifdef ONE_SHOT_Q_ESTIMATE
 #ifdef STRICT_ONE_SHOT_Q
-    cpi->active_best_quality = Q;
+    cpi->active_best_quality = q;
 #else
-    cpi->active_best_quality = inter_minq[Q];
+    cpi->active_best_quality = inter_minq[q];
 #endif
 #else
-    cpi->active_best_quality = inter_minq[Q];
+    cpi->active_best_quality = inter_minq[q];
 #endif
 
-    // For the constant/constrained quality mode we dont want
+    // For the constant/constrained quality mode we don't want
     // q to fall below the cq level.
     if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
         (cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2859,22 +2600,45 @@
 
   // Special case code to try and match quality with forced key frames
   if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-    Q = cpi->last_boosted_qindex;
+    q = cpi->last_boosted_qindex;
   } else {
     // Determine initial Q to try
-    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+    q = vp9_regulate_q(cpi, cpi->this_frame_target);
   }
 
   vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
                                 &frame_over_shoot_limit);
 
-  // Limit Q range for the adaptive loop.
-  bottom_index = cpi->active_best_quality;
-  top_index    = cpi->active_worst_quality;
-  q_low  = cpi->active_best_quality;
-  q_high = cpi->active_worst_quality;
+#if CONFIG_MULTIPLE_ARF
+  // Force the quantizer determined by the coding order pattern.
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) {
+    double new_q;
+    double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
+    int level = cpi->this_frame_weight;
+    assert(level >= 0);
 
+    // Set quantizer steps at 10% increments.
+    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
+    q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q);
+
+    bottom_index = q;
+    top_index    = q;
+    q_low  = q;
+    q_high = q;
+
+    printf("frame:%d q:%d\n", cm->current_video_frame, q);
+  } else {
+#endif
+    // Limit Q range for the adaptive loop.
+    bottom_index = cpi->active_best_quality;
+    top_index    = cpi->active_worst_quality;
+    q_low  = cpi->active_best_quality;
+    q_high = cpi->active_worst_quality;
+#if CONFIG_MULTIPLE_ARF
+  }
+#endif
   loop_count = 0;
+  vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
 
   if (cm->frame_type != KEY_FRAME) {
     /* TODO: Decide this more intelligently */
@@ -2885,16 +2649,10 @@
       cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
     }
     /* TODO: Decide this more intelligently */
-    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
+    xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
     set_mvcost(&cpi->mb);
   }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->current_video_frame == 0) {
-    cm->use_interintra = 1;
-  }
-#endif
-
 #if CONFIG_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -2919,7 +2677,7 @@
         break;
     }
 
-    vp9_denoise(cpi->Source, cpi->Source, l, 1, 0);
+    vp9_denoise(cpi->Source, cpi->Source, l);
   }
 
 #endif
@@ -2942,66 +2700,23 @@
   do {
     vp9_clear_system_state();  // __asm emms;
 
-    vp9_set_quantizer(cpi, Q);
+    vp9_set_quantizer(cpi, q);
 
     if (loop_count == 0) {
 
-      // setup skip prob for costing in mode/mv decision
-      if (cpi->common.mb_no_coeff_skip) {
-        int k;
-        for (k = 0; k < MBSKIP_CONTEXTS; k++)
-          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
-
-        if (cm->frame_type != KEY_FRAME) {
-          if (cpi->refresh_alt_ref_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[2][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
-            }
-          } else if (cpi->refresh_golden_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[1][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
-            }
-          } else {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[0][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
-            }
-          }
-
-          // as this is for cost estimate, let's make sure it does not
-          // get extreme either way
-          {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-              if (cm->mbskip_pred_probs[k] < 5)
-                cm->mbskip_pred_probs[k] = 5;
-
-              if (cm->mbskip_pred_probs[k] > 250)
-                cm->mbskip_pred_probs[k] = 250;
-
-              if (cpi->is_src_frame_alt_ref)
-                cm->mbskip_pred_probs[k] = 1;
-            }
-          }
-        }
-      }
-
       // Set up entropy depending on frame type.
       if (cm->frame_type == KEY_FRAME) {
         /* Choose which entropy context to use. When using a forward reference
-	 * frame, it immediately follows the keyframe, and thus benefits from
-	 * using the same entropy context established by the keyframe. Otherwise,
-	 * use the default context 0.
-	 */
+         * frame, it immediately follows the keyframe, and thus benefits from
+         * using the same entropy context established by the keyframe.
+         *  Otherwise, use the default context 0.
+         */
         cm->frame_context_idx = cpi->oxcf.play_alternate;
         vp9_setup_key_frame(cpi);
       } else {
-	/* Choose which entropy context to use. Currently there are only two
-	 * contexts used, one for normal frames and one for alt ref frames.
-	 */
+        /* Choose which entropy context to use. Currently there are only two
+         * contexts used, one for normal frames and one for alt ref frames.
+         */
         cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         vp9_setup_inter_frame(cpi);
       }
@@ -3008,16 +2723,12 @@
     }
 
     // transform / motion compensation build reconstruction frame
-#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS
-    if (cm->frame_type == KEY_FRAME)
-      vp9_adjust_default_coef_probs(cm);
-#endif
 
     vp9_encode_frame(cpi);
 
     // Update the skip mb flag probabilities based on the distribution
     // seen in the last encoder iteration.
-    update_base_skip_probs(cpi);
+    // update_base_skip_probs(cpi);
 
     vp9_clear_system_state();  // __asm emms;
 
@@ -3032,61 +2743,55 @@
 
     if (frame_over_shoot_limit == 0)
       frame_over_shoot_limit = 1;
-    active_worst_qchanged = FALSE;
+    active_worst_qchanged = 0;
 
     // Special case handling for forced key frames
     if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-      int last_q = Q;
+      int last_q = q;
       int kf_err = vp9_calc_ss_err(cpi->Source,
                                    &cm->yv12_fb[cm->new_fb_idx]);
 
       int high_err_target = cpi->ambient_err;
-      int low_err_target = (cpi->ambient_err >> 1);
+      int low_err_target = cpi->ambient_err >> 1;
 
       // Prevent possible divide by zero error below for perfect KF
-      kf_err += (!kf_err);
+      kf_err += !kf_err;
 
       // The key frame is not good enough or we can afford
       // to make it better without undue risk of popping.
-      if (((kf_err > high_err_target) &&
-           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
-          ((kf_err > low_err_target) &&
-           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
+      if ((kf_err > high_err_target &&
+           cpi->projected_frame_size <= frame_over_shoot_limit) ||
+          (kf_err > low_err_target &&
+           cpi->projected_frame_size <= frame_under_shoot_limit)) {
         // Lower q_high
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
+        q_high = q > q_low ? q - 1 : q_low;
 
         // Adjust Q
-        Q = (Q * high_err_target) / kf_err;
-        if (Q < ((q_high + q_low) >> 1))
-          Q = (q_high + q_low) >> 1;
-      }
-      // The key frame is much better than the previous frame
-      else if ((kf_err < low_err_target) &&
-               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
+        q = (q * high_err_target) / kf_err;
+        q = MIN(q, (q_high + q_low) >> 1);
+      } else if (kf_err < low_err_target &&
+                cpi->projected_frame_size >= frame_under_shoot_limit) {
+        // The key frame is much better than the previous frame
         // Raise q_low
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
+        q_low = q < q_high ? q + 1 : q_high;
 
         // Adjust Q
-        Q = (Q * low_err_target) / kf_err;
-        if (Q > ((q_high + q_low + 1) >> 1))
-          Q = (q_high + q_low + 1) >> 1;
+        q = (q * low_err_target) / kf_err;
+        q = MIN(q, (q_high + q_low + 1) >> 1);
       }
 
       // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
+      q = clamp(q, q_low, q_high);
 
-      Loop = ((Q != last_q)) ? TRUE : FALSE;
+      loop = q != last_q;
     }
 
     // Is the projected frame size out of range and are we allowed to attempt to recode.
     else if (recode_loop_test(cpi,
                               frame_over_shoot_limit, frame_under_shoot_limit,
-                              Q, top_index, bottom_index)) {
-      int last_q = Q;
-      int Retries = 0;
+                              q, top_index, bottom_index)) {
+      int last_q = q;
+      int retries = 0;
 
       // Frame size out of permitted range:
       // Update correction factor & compute new Q to try...
@@ -3093,77 +2798,78 @@
 
       // Frame is too large
       if (cpi->projected_frame_size > cpi->this_frame_target) {
-        q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
+        // Raise Qlow as to at least the current value
+        q_low = q < q_high ? q + 1 : q_high;
 
-        if (undershoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+        if (undershoot_seen || loop_count > 1) {
+          // Update rate_correction_factor unless cpi->active_worst_quality
+          // has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 1);
 
-          Q = (q_high + q_low + 1) / 2;
+          q = (q_high + q_low + 1) / 2;
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 0);
 
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+          q = vp9_regulate_q(cpi, cpi->this_frame_target);
 
-          while ((Q < q_low) && (Retries < 10)) {
+          while (q < q_low && retries < 10) {
             vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            retries++;
           }
         }
 
-        overshoot_seen = TRUE;
-      }
-      // Frame is too small
-      else {
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
+        overshoot_seen = 1;
+      } else {
+        // Frame is too small
+        q_high = q > q_low ? q - 1 : q_low;
 
-        if (overshoot_seen || (loop_count > 1)) {
+        if (overshoot_seen || loop_count > 1) {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 1);
 
-          Q = (q_high + q_low) / 2;
+          q = (q_high + q_low) / 2;
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 0);
 
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+          q = vp9_regulate_q(cpi, cpi->this_frame_target);
 
           // Special case reset for qlow for constrained quality.
           // This should only trigger where there is very substantial
           // undershoot on a frame and the auto cq level is above
           // the user passsed in value.
-          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-              (Q < q_low)) {
-            q_low = Q;
+          if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
+            q_low = q;
           }
 
-          while ((Q > q_high) && (Retries < 10)) {
+          while (q > q_high && retries < 10) {
             vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            retries++;
           }
         }
 
-        undershoot_seen = TRUE;
+        undershoot_seen = 1;
       }
 
       // Clamp Q to upper and lower limits:
-      Q = clamp(Q, q_low, q_high);
+      q = clamp(q, q_low, q_high);
 
-      Loop = Q != last_q;
-    } else
-      Loop = FALSE;
+      loop = q != last_q;
+    } else {
+      loop = 0;
+    }
 
     if (cpi->is_src_frame_alt_ref)
-      Loop = FALSE;
+      loop = 0;
 
-    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
+    if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
       if (mcomp_filter_index < mcomp_filters) {
         int64_t err = vp9_calc_ss_err(cpi->Source,
                                     &cm->yv12_fb[cm->new_fb_idx]);
@@ -3174,7 +2880,7 @@
         if (mcomp_filter_index < mcomp_filters) {
           cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
           loop_count = -1;
-          Loop = TRUE;
+          loop = 1;
         } else {
           int f;
           int64_t best_cost = mcomp_filter_cost[0];
@@ -3187,7 +2893,7 @@
           }
           if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
             loop_count = -1;
-            Loop = TRUE;
+            loop = 1;
             cm->mcomp_filter_type = mcomp_best_filter;
           }
           /*
@@ -3197,12 +2903,12 @@
           */
         }
 #if RESET_FOREACH_FILTER
-        if (Loop == TRUE) {
-          overshoot_seen = FALSE;
-          undershoot_seen = FALSE;
+        if (loop) {
+          overshoot_seen = 0;
+          undershoot_seen = 0;
           q_low = q_low0;
           q_high = q_high0;
-          Q = Q0;
+          q = Q0;
           cpi->rate_correction_factor = rate_correction_factor0;
           cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
           cpi->active_best_quality = active_best_quality0;
@@ -3212,7 +2918,7 @@
       }
     }
 
-    if (Loop == TRUE) {
+    if (loop) {
       loop_count++;
 
 #if CONFIG_INTERNAL_STATS
@@ -3219,7 +2925,7 @@
       cpi->tot_recode_hits++;
 #endif
     }
-  } while (Loop == TRUE);
+  } while (loop);
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
@@ -3229,51 +2935,9 @@
                                        &cm->yv12_fb[cm->new_fb_idx]);
   }
 
-  // This frame's MVs are saved and will be used in next frame's MV
-  // prediction. Last frame has one more line(add to bottom) and one
-  // more column(add to right) than cm->mip. The edge elements are
-  // initialized to 0.
-  if (cm->show_frame) { // do not save for altref frame
-    int mb_row;
-    int mb_col;
-    MODE_INFO *tmp = cm->mip;
-
-    if (cm->frame_type != KEY_FRAME) {
-      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
-        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
-          if (tmp->mbmi.ref_frame != INTRA_FRAME)
-            cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
-
-          cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-          cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
-          tmp++;
-        }
-      }
-    }
-  }
-
-  // Update the GF useage maps.
-  // This is done after completing the compression of a frame when all modes
-  // etc. are finalized but before loop filter
-  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
   if (cm->frame_type == KEY_FRAME)
     cpi->refresh_last_frame = 1;
 
-#if 0
-  {
-    FILE *f = fopen("gfactive.stt", "a");
-    fprintf(f, "%8d %8d %8d %8d %8d\n",
-            cm->current_video_frame,
-            (100 * cpi->gf_active_count)
-              / (cpi->common.mb_rows * cpi->common.mb_cols),
-            cpi->this_iiratio,
-            cpi->next_iiratio,
-            cpi->refresh_golden_frame);
-    fclose(f);
-  }
-#endif
-
   cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
 #if WRITE_RECON_BUFFER
@@ -3288,38 +2952,42 @@
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 2000);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 3000);
+#endif
+
   // build the bitstream
   cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
-  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+  if (xd->update_mb_segmentation_map) {
     update_reference_segmentation_map(cpi);
   }
 
   release_scaled_references(cpi);
   update_reference_frames(cpi);
-  vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);
-  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
-  vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
+
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    vp9_full_to_model_counts(cpi->common.fc.coef_counts[t],
+                             cpi->coef_counts[t]);
   if (!cpi->common.error_resilient_mode &&
       !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(&cpi->common);
-#endif
   }
+
   if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
-    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+    vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
-    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-#if CONFIG_COMP_INTERINTRA_PRED
-    vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
-#endif
+    vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
+    vp9_copy(cm->fc.intra_inter_count, cpi->intra_inter_count);
+    vp9_copy(cm->fc.comp_inter_count, cpi->comp_inter_count);
+    vp9_copy(cm->fc.single_ref_count, cpi->single_ref_count);
+    vp9_copy(cm->fc.comp_ref_count, cpi->comp_ref_count);
     cpi->common.fc.NMVcount = cpi->NMVcount;
     if (!cpi->common.error_resilient_mode &&
         !cpi->common.frame_parallel_decoding_mode) {
@@ -3328,9 +2996,9 @@
       vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
     }
   }
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->frame_type != KEY_FRAME)
-    select_interintra_mode(cpi);
+
+#ifdef ENTROPY_STATS
+  vp9_update_mode_context_stats(cpi);
 #endif
 
   /* Move storing frame_type out of the above loop since it is also
@@ -3368,16 +3036,16 @@
     cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
 
   // Keep a record from which we can calculate the average Q excluding GF updates and key frames
-  if ((cm->frame_type != KEY_FRAME)
-      && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_golden_frame &&
+      !cpi->refresh_alt_ref_frame) {
     cpi->ni_frames++;
-    cpi->tot_q += vp9_convert_qindex_to_q(Q);
+    cpi->tot_q += vp9_convert_qindex_to_q(q);
     cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
 
-    // Calculate the average Q for normal inter frames (not key or GFU
-    // frames).
-    cpi->ni_tot_qi += Q;
-    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+    // Calculate the average Q for normal inter frames (not key or GFU frames).
+    cpi->ni_tot_qi += q;
+    cpi->ni_av_qi = cpi->ni_tot_qi / cpi->ni_frames;
   }
 
   // Update the buffer level variable.
@@ -3406,7 +3074,7 @@
   }
 
   // Actual bits spent
-  cpi->total_actual_bits    += cpi->projected_frame_size;
+  cpi->total_actual_bits += cpi->projected_frame_size;
 
   // Debug stats
   cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
@@ -3417,20 +3085,18 @@
   if (cm->frame_type == KEY_FRAME) {
     cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
 
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
+    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
   } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
 
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
+    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
   }
 
   // Update the skip mb flag probabilities based on the distribution seen
   // in this frame.
-  update_base_skip_probs(cpi);
+  // update_base_skip_probs(cpi);
 
-#if 0  // 1 && CONFIG_INTERNAL_STATS
+#if 0 && CONFIG_INTERNAL_STATS
   {
     FILE *f = fopen("tmp.stt", "a");
     int recon_err;
@@ -3440,7 +3106,7 @@
     recon_err = vp9_calc_ss_err(cpi->Source,
                                 &cm->yv12_fb[cm->new_fb_idx]);
 
-    if (cpi->twopass.total_left_stats->coded_error != 0.0)
+    if (cpi->twopass.total_left_stats.coded_error != 0.0)
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
               "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
@@ -3463,9 +3129,9 @@
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               (double)cpi->twopass.bits_left /
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               cpi->tot_recode_hits, recon_err, cpi->kf_boost,
               cpi->kf_zeromotion_pct);
     else
@@ -3492,7 +3158,7 @@
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               cpi->tot_recode_hits, recon_err, cpi->kf_boost,
               cpi->kf_zeromotion_pct);
 
@@ -3577,10 +3243,33 @@
     // Tell the caller that the frame was coded as a key frame
     *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
 
-    // As this frame is a key frame  the next defaults to an inter frame.
+#if CONFIG_MULTIPLE_ARF
+    // Reset the sequence number.
+    if (cpi->multi_arf_enabled) {
+      cpi->sequence_number = 0;
+      cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+      cpi->new_frame_coding_order_period = -1;
+    }
+#endif
+
+    // As this frame is a key frame the next defaults to an inter frame.
     cm->frame_type = INTER_FRAME;
   } else {
     *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+#if CONFIG_MULTIPLE_ARF
+    /* Increment position in the coded frame sequence. */
+    if (cpi->multi_arf_enabled) {
+      ++cpi->sequence_number;
+      if (cpi->sequence_number >= cpi->frame_coding_order_period) {
+        cpi->sequence_number = 0;
+        cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+        cpi->new_frame_coding_order_period = -1;
+      }
+      cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
+      assert(cpi->this_frame_weight >= 0);
+    }
+#endif
   }
 
   // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
@@ -3592,16 +3281,16 @@
   cm->last_width = cm->width;
   cm->last_height = cm->height;
 
-  // Dont increment frame counters if this was an altref buffer update not a real frame
+  // Don't increment frame counters if this was an altref buffer
+  // update not a real frame
+  cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
-    cm->current_video_frame++;
-    cpi->frames_since_key++;
+    ++cm->current_video_frame;
+    ++cpi->frames_since_key;
   }
 
   // reset to normal state now that we are done.
 
-
-
 #if 0
   {
     char filename[512];
@@ -3620,11 +3309,15 @@
 
   if (cm->show_frame) {
     vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+               cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+               sizeof(MODE_INFO));
   } else {
     vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+               cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+               sizeof(MODE_INFO));
   }
+  // restore prev_mi
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 }
 
 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
@@ -3662,6 +3355,15 @@
   struct vpx_usec_timer  timer;
   int                    res = 0;
 
+  if (!cpi->initial_width) {
+    // TODO(jkoleszar): Support 1/4 subsampling?
+    cm->subsampling_x = sd->uv_width < sd->y_width;
+    cm->subsampling_y = sd->uv_height < sd->y_height;
+    alloc_raw_frame_buffers(cpi);
+
+    cpi->initial_width = cm->width;
+    cpi->initial_height = cm->height;
+  }
   vpx_usec_timer_start(&timer);
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
@@ -3676,15 +3378,24 @@
 
 static int frame_is_reference(const VP9_COMP *cpi) {
   const VP9_COMMON *cm = &cpi->common;
-  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const MACROBLOCKD *mb = &cpi->mb.e_mbd;
 
-  return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame
-         || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame
-         || cm->refresh_entropy_probs
-         || xd->mode_ref_lf_delta_update
-         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+  return cm->frame_type == KEY_FRAME ||
+         cpi->refresh_last_frame ||
+         cpi->refresh_golden_frame ||
+         cpi->refresh_alt_ref_frame ||
+         cm->refresh_frame_context ||
+         mb->mode_ref_lf_delta_update ||
+         mb->update_mb_segmentation_map ||
+         mb->update_mb_segmentation_data;
 }
 
+#if CONFIG_MULTIPLE_ARF
+int is_next_frame_arf(VP9_COMP *cpi) {
+  // Negative entry in frame_coding_order indicates an ARF at this position.
+  return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0;
+}
+#endif
 
 int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                             unsigned long *size, unsigned char *dest,
@@ -3693,6 +3404,8 @@
   VP9_COMMON *cm = &cpi->common;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
+  int i;
+  // FILE *fp_out = fopen("enc_frame_type.txt", "a");
 
   if (!cpi)
     return -1;
@@ -3704,46 +3417,117 @@
   cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
   set_mvcost(&cpi->mb);
 
-  // Should we code an alternate reference frame
-  if (cpi->oxcf.play_alternate &&
-      cpi->source_alt_ref_pending) {
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
-                                          cpi->frames_till_gf_update_due))) {
+  // Should we code an alternate reference frame.
+  if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
+    int frames_to_arf;
+
+#if CONFIG_MULTIPLE_ARF
+    assert(!cpi->multi_arf_enabled ||
+           cpi->frame_coding_order[cpi->sequence_number] < 0);
+
+    if (cpi->multi_arf_enabled && (cpi->pass == 2))
+      frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
+        - cpi->next_frame_in_order;
+    else
+#endif
+      frames_to_arf = cpi->frames_till_gf_update_due;
+
+    assert(frames_to_arf < cpi->twopass.frames_to_key);
+
+    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
+#if CONFIG_MULTIPLE_ARF
+      cpi->alt_ref_source[cpi->arf_buffered] = cpi->source;
+#else
       cpi->alt_ref_source = cpi->source;
+#endif
+
       if (cpi->oxcf.arnr_max_frames > 0) {
-        vp9_temporal_filter_prepare(cpi, cpi->frames_till_gf_update_due);
+        // Produce the filtered ARF frame.
+        // TODO(agrange) merge these two functions.
+        configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
+                              cpi->gfu_boost);
+        vp9_temporal_filter_prepare(cpi, frames_to_arf);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
-      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+
+      cm->show_frame = 0;
+      cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
-      cm->show_frame = 0;
-      cpi->source_alt_ref_pending = FALSE;   // Clear Pending altf Ref flag.
       cpi->is_src_frame_alt_ref = 0;
+
+      // TODO(agrange) This needs to vary depending on where the next ARF is.
+      cm->frames_till_alt_ref_frame = frames_to_arf;
+
+#if CONFIG_MULTIPLE_ARF
+      if (!cpi->multi_arf_enabled)
+#endif
+        cpi->source_alt_ref_pending = 0;   // Clear Pending altf Ref flag.
     }
   }
 
   if (!cpi->source) {
+#if CONFIG_MULTIPLE_ARF
+    int i;
+#endif
     if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
       cm->show_frame = 1;
 
+#if CONFIG_MULTIPLE_ARF
+      // Is this frame the ARF overlay.
+      cpi->is_src_frame_alt_ref = 0;
+      for (i = 0; i < cpi->arf_buffered; ++i) {
+        if (cpi->source == cpi->alt_ref_source[i]) {
+          cpi->is_src_frame_alt_ref = 1;
+          cpi->refresh_golden_frame = 1;
+          break;
+        }
+      }
+#else
       cpi->is_src_frame_alt_ref = cpi->alt_ref_source
                                   && (cpi->source == cpi->alt_ref_source);
-
+#endif
       if (cpi->is_src_frame_alt_ref) {
-        cpi->refresh_last_frame = 0;
+        // Current frame is an ARF overlay frame.
+#if CONFIG_MULTIPLE_ARF
+        cpi->alt_ref_source[i] = NULL;
+#else
         cpi->alt_ref_source = NULL;
+#endif
+        // Don't refresh the last buffer for an ARF overlay frame. It will
+        // become the GF so preserve last as an alternative prediction option.
+        cpi->refresh_last_frame = 0;
       }
+#if CONFIG_MULTIPLE_ARF
+      ++cpi->next_frame_in_order;
+#endif
     }
   }
 
   if (cpi->source) {
-    cpi->un_scaled_source =
-      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
+                                                           : &cpi->source->img;
     *time_stamp = cpi->source->ts_start;
     *time_end = cpi->source->ts_end;
     *frame_flags = cpi->source->flags;
+
+    // fprintf(fp_out, "   Frame:%d", cm->current_video_frame);
+#if CONFIG_MULTIPLE_ARF
+    if (cpi->multi_arf_enabled) {
+      // fprintf(fp_out, "   seq_no:%d  this_frame_weight:%d",
+      //         cpi->sequence_number, cpi->this_frame_weight);
+    } else {
+      // fprintf(fp_out, "\n");
+    }
+#else
+    // fprintf(fp_out, "\n");
+#endif
+
+#if CONFIG_MULTIPLE_ARF
+    if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
+      cpi->source_alt_ref_pending = is_next_frame_arf(cpi);
+#endif
   } else {
     *size = 0;
     if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
@@ -3751,6 +3535,7 @@
       cpi->twopass.first_pass_done = 1;
     }
 
+    // fclose(fp_out);
     return -1;
   }
 
@@ -3768,11 +3553,11 @@
       this_duration = cpi->source->ts_end - cpi->source->ts_start;
       step = 1;
     } else {
-      int64_t last_duration;
+      int64_t last_duration = cpi->last_end_time_stamp_seen
+                                - cpi->last_time_stamp_seen;
 
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-      last_duration = cpi->last_end_time_stamp_seen
-                      - cpi->last_time_stamp_seen;
+
       // do a step update if the duration changes by 10%
       if (last_duration)
         step = (int)((this_duration - last_duration) * 10 / last_duration);
@@ -3779,21 +3564,15 @@
     }
 
     if (this_duration) {
-      if (step)
+      if (step) {
         vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
-      else {
-        double avg_duration, interval;
-
-        /* Average this frame's rate into the last second's average
-         * frame rate. If we haven't seen 1 second yet, then average
-         * over the whole interval seen.
-         */
-        interval = (double)(cpi->source->ts_end
-                            - cpi->first_time_stamp_ever);
-        if (interval > 10000000.0)
-          interval = 10000000;
-
-        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+      } else {
+        // Average this frame's rate into the last second's average
+        // frame rate. If we haven't seen 1 second yet, then average
+        // over the whole interval seen.
+        const double interval = MIN((double)(cpi->source->ts_end
+                                     - cpi->first_time_stamp_ever), 10000000.0);
+        double avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
         avg_duration *= (interval - avg_duration + this_duration);
         avg_duration /= interval;
 
@@ -3811,22 +3590,6 @@
   // Clear down mmx registers
   vp9_clear_system_state();  // __asm emms;
 
-  cm->frame_type = INTER_FRAME;
-  cm->frame_flags = *frame_flags;
-
-#if 0
-
-  if (cpi->refresh_alt_ref_frame) {
-    // cpi->refresh_golden_frame = 1;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 0;
-  } else {
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 1;
-  }
-
-#endif
-
   /* find a free buffer for the new frame, releasing the reference previously
    * held.
    */
@@ -3833,17 +3596,50 @@
   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
   cm->new_fb_idx = get_free_fb(cm);
 
+#if CONFIG_MULTIPLE_ARF
+  /* Set up the correct ARF frame. */
+  if (cpi->refresh_alt_ref_frame) {
+    ++cpi->arf_buffered;
+  }
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      (cpi->pass == 2)) {
+    cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number];
+  }
+#endif
+
   /* Get the mapping of L/G/A to the reference buffer pool */
   cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
   cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
   cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
 
-  /* Reset the frame pointers to the current frame size */
-  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
-                                cm->width, cm->height,
-                                VP9BORDERINPIXELS);
+#if 0  // CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled) {
+    fprintf(fp_out, "      idx(%d, %d, %d, %d) active(%d, %d, %d)",
+        cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx,
+        cm->active_ref_idx[0], cm->active_ref_idx[1], cm->active_ref_idx[2]);
+    if (cpi->refresh_alt_ref_frame)
+      fprintf(fp_out, "  type:ARF");
+    if (cpi->is_src_frame_alt_ref)
+      fprintf(fp_out, "  type:OVERLAY[%d]", cpi->alt_fb_idx);
+    fprintf(fp_out, "\n");
+  }
+#endif
 
+  cm->frame_type = INTER_FRAME;
+  cm->frame_flags = *frame_flags;
+
+  // Reset the frame pointers to the current frame size
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
+
+  // Calculate scaling factors for each of the 3 available references
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+    vp9_setup_scale_factors(cm, i);
+
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+
   if (cpi->pass == 1) {
     Pass1Encode(cpi, size, dest, frame_flags);
   } else if (cpi->pass == 2) {
@@ -3852,10 +3648,8 @@
     encode_frame_to_data_rate(cpi, size, dest, frame_flags);
   }
 
-  if (cm->refresh_entropy_probs) {
-    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
-               sizeof(cm->fc));
-  }
+  if (cm->refresh_frame_context)
+    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
 
   if (*size > 0) {
     // if its a dropped frame honor the requests on subsequent frames
@@ -3862,20 +3656,19 @@
     cpi->droppable = !frame_is_reference(cpi);
 
     // return to normal state
-    cm->refresh_entropy_probs = 1;
+    cm->reset_frame_context = 0;
+    cm->refresh_frame_context = 1;
     cpi->refresh_alt_ref_frame = 0;
     cpi->refresh_golden_frame = 0;
     cpi->refresh_last_frame = 1;
     cm->frame_type = INTER_FRAME;
-
   }
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
-  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
+  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
-  }
 
 #if CONFIG_INTERNAL_STATS
 
@@ -3923,7 +3716,7 @@
           double weight = 0;
 #if CONFIG_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->filter_level * 10 / 6, 1, 0);
+                      cm->filter_level * 10 / 6);
 #endif
           vp9_clear_system_state();
 
@@ -3950,10 +3743,16 @@
           cpi->totalp  += frame_psnr2;
 
           frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
+                                      recon, 1, &weight);
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
+
+          frame_ssim2 = vp9_calc_ssim(cpi->Source,
+                                      &cm->post_proc_buffer, 1, &weight);
+
+          cpi->summedp_quality += frame_ssim2 * weight;
+          cpi->summedp_weights += weight;
 #if 0
           {
             FILE *f = fopen("q_used.stt", "a");
@@ -3975,12 +3774,11 @@
         cpi->total_ssimg_v += v;
         cpi->total_ssimg_all += frame_all;
       }
-
     }
   }
 
 #endif
-
+  // fclose(fp_out);
   return 0;
 }
 
@@ -4013,8 +3811,9 @@
 }
 
 int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[4], int delta_lf[4],
-                   unsigned int threshold[4]) {
+                   unsigned int cols, int delta_q[MAX_MB_SEGMENTS],
+                   int delta_lf[MAX_MB_SEGMENTS],
+                   unsigned int threshold[MAX_MB_SEGMENTS]) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
   signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -4034,25 +3833,15 @@
   // Activate segmentation.
   vp9_enable_segmentation((VP9_PTR)cpi);
 
-  // Set up the quant segment data
-  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
-  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
-  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
-  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
+  // Set up the quan, LF and breakout threshold segment data
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
+    feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
+    cpi->segment_encode_breakout[i] = threshold[i];
+  }
 
-  // Set up the loop segment data s
-  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
-  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
-  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
-  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
-  cpi->segment_encode_breakout[0] = threshold[0];
-  cpi->segment_encode_breakout[1] = threshold[1];
-  cpi->segment_encode_breakout[2] = threshold[2];
-  cpi->segment_encode_breakout[3] = threshold[3];
-
   // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
     if (delta_q[i])
       vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
     else
@@ -4079,8 +3868,9 @@
     if (map) {
       vpx_memcpy(cpi->active_map, map, rows * cols);
       cpi->active_map_enabled = 1;
-    } else
+    } else {
       cpi->active_map_enabled = 0;
+    }
 
     return 0;
   } else {
@@ -4095,12 +3885,9 @@
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
-  if (horiz_mode > ONETWO)
+  if (horiz_mode > ONETWO || vert_mode > ONETWO)
     return -1;
 
-  if (vert_mode > ONETWO)
-    return -1;
-
   Scale2Ratio(horiz_mode, &hr, &hs);
   Scale2Ratio(vert_mode, &vr, &vs);
 
@@ -4141,6 +3928,5 @@
 
 
 int vp9_get_quantizer(VP9_PTR c) {
-  VP9_COMP   *cpi = (VP9_COMP *) c;
-  return cpi->common.base_qindex;
+  return ((VP9_COMP *)c)->common.base_qindex;
 }
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -30,24 +30,25 @@
 #include "vp9/encoder/vp9_lookahead.h"
 
 // Experimental rate control switches
-// #define ONE_SHOT_Q_ESTIMATE 1
-// #define STRICT_ONE_SHOT_Q 1
-// #define DISABLE_RC_LONG_TERM_MEM 1
+#if CONFIG_ONESHOTQ
+#define ONE_SHOT_Q_ESTIMATE 0
+#define STRICT_ONE_SHOT_Q 0
+#define DISABLE_RC_LONG_TERM_MEM 0
+#endif
 
 // #define SPEEDSTATS 1
+#if CONFIG_MULTIPLE_ARF
+// Set MIN_GF_INTERVAL to 1 for the full decomposition.
+#define MIN_GF_INTERVAL             2
+#else
 #define MIN_GF_INTERVAL             4
+#endif
 #define DEFAULT_GF_INTERVAL         7
 
 #define KEY_FRAME_CONTEXT 5
 
-#define MAX_LAG_BUFFERS 25
+#define MAX_MODES 36
 
-#if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54
-#else
-#define MAX_MODES 42
-#endif
-
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
 
@@ -63,63 +64,35 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-#ifdef MODE_STATS
-  // Stats
-  int y_modes[VP9_YMODES];
-  int uv_modes[VP9_UV_MODES];
-  int i8x8_modes[VP9_I8X8_MODES];
-  int b_modes[B_MODE_COUNT];
-  int inter_y_modes[MB_MODE_COUNT];
-  int inter_uv_modes[VP9_UV_MODES];
-  int inter_b_modes[B_MODE_COUNT];
-#endif
-
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob comp_ref_prob[REF_CONTEXTS];
 
   unsigned char *last_frame_seg_map_copy;
 
   // 0 = Intra, Last, GF, ARF
   signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
-  // 0 = BPRED, ZERO_MV, MV, SPLIT
+  // 0 = ZERO_MV, MV
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
-  vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+  vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1];
+  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_prob interintra_prob;
-#endif
 
-  int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
-  int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
+  int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
+  vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
 } CODING_CONTEXT;
 
 typedef struct {
@@ -142,8 +115,7 @@
   double new_mv_count;
   double duration;
   double count;
-}
-FIRSTPASS_STATS;
+} FIRSTPASS_STATS;
 
 typedef struct {
   int frames_so_far;
@@ -155,7 +127,6 @@
   double frame_mvr_abs;
   double frame_mvc;
   double frame_mvc_abs;
-
 } ONEPASS_FRAMESTATS;
 
 typedef struct {
@@ -207,12 +178,7 @@
   THR_SPLITA,
 
   THR_B_PRED,
-  THR_I8X8_PRED,
 
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
   THR_COMP_ZEROLA,
   THR_COMP_NEARESTLA,
   THR_COMP_NEARLA,
@@ -221,32 +187,13 @@
   THR_COMP_NEARESTGA,
   THR_COMP_NEARGA,
 
-  THR_COMP_NEWLG,
   THR_COMP_NEWLA,
   THR_COMP_NEWGA,
 
-  THR_COMP_SPLITLG,
   THR_COMP_SPLITLA,
   THR_COMP_SPLITGA,
-#if CONFIG_COMP_INTERINTRA_PRED
-  THR_COMP_INTERINTRA_ZEROL,
-  THR_COMP_INTERINTRA_NEARESTL,
-  THR_COMP_INTERINTRA_NEARL,
-  THR_COMP_INTERINTRA_NEWL,
+} THR_MODES;
 
-  THR_COMP_INTERINTRA_ZEROG,
-  THR_COMP_INTERINTRA_NEARESTG,
-  THR_COMP_INTERINTRA_NEARG,
-  THR_COMP_INTERINTRA_NEWG,
-
-  THR_COMP_INTERINTRA_ZEROA,
-  THR_COMP_INTERINTRA_NEARESTA,
-  THR_COMP_INTERINTRA_NEARA,
-  THR_COMP_INTERINTRA_NEWA,
-#endif
-}
-THR_MODES;
-
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -256,7 +203,6 @@
 typedef struct {
   int RD;
   SEARCH_METHODS search_method;
-  int improved_dct;
   int auto_filter;
   int recode_loop;
   int iterative_sub_pixel;
@@ -266,41 +212,25 @@
   int max_step_search_steps;
   int first_step;
   int optimize_coefficients;
-  int no_skip_block4x4_search;
   int search_best_filter;
-  int splitmode_breakout;
-  int mb16_breakout;
   int static_segmentation;
+  int comp_inter_joint_search_thresh;
+  int adpative_rd_thresh;
 } SPEED_FEATURES;
 
-typedef struct {
-  MACROBLOCK  mb;
-  int totalrate;
-} MB_ROW_COMP;
-
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-} TOKENLIST;
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} ENCODETHREAD_DATA;
-typedef struct {
-  int ithread;
-  void *ptr1;
-} LPFTHREAD_DATA;
-
 enum BlockSize {
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
+  BLOCK_4X4,
+  BLOCK_4X8,
+  BLOCK_8X4,
+  BLOCK_8X8,
+  BLOCK_8X16,
+  BLOCK_16X8,
   BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS,
-  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32,
+  BLOCK_32X16,
+  BLOCK_16X32,
+  BLOCK_64X32,
+  BLOCK_32X64,
   BLOCK_64X64,
   BLOCK_MAX_SB_SEGMENTS,
 };
@@ -307,17 +237,25 @@
 
 typedef struct VP9_COMP {
 
-  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
+#endif
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
   MACROBLOCK mb;
@@ -326,7 +264,11 @@
 
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
+#if CONFIG_MULTIPLE_ARF
+  struct lookahead_entry  *alt_ref_source[NUM_REF_FRAMES];
+#else
   struct lookahead_entry  *alt_ref_source;
+#endif
 
   YV12_BUFFER_CONFIG *Source;
   YV12_BUFFER_CONFIG *un_scaled_source;
@@ -345,6 +287,9 @@
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+  int alt_ref_fb_idx[NUM_REF_FRAMES - 3];
+#endif
   int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
@@ -358,6 +303,12 @@
   unsigned int key_frame_frequency;
   unsigned int this_key_frame_forced;
   unsigned int next_key_frame_forced;
+#if CONFIG_MULTIPLE_ARF
+  // Position within a frame coding order (including any additional ARF frames).
+  unsigned int sequence_number;
+  // Next frame in naturally occurring order that has not yet been coded.
+  int next_frame_in_order;
+#endif
 
   // Ambient reconstruction err target for force key frames
   int ambient_err;
@@ -367,16 +318,19 @@
   unsigned int mode_chosen_counts[MAX_MODES];
 
   int rd_thresh_mult[MAX_MODES];
-  int rd_baseline_thresh[MAX_MODES];
-  int rd_threshes[MAX_MODES];
+  int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES];
+  int rd_threshes[BLOCK_SIZE_TYPES][MAX_MODES];
+  int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES];
+
   int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
   int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
-  int comp_pred_count[COMP_PRED_CONTEXTS];
-  int single_pred_count[COMP_PRED_CONTEXTS];
+  unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref_count[REF_CONTEXTS][2][2];
+  unsigned int comp_ref_count[REF_CONTEXTS][2];
+
   // FIXME contextualize
-  int txfm_count_32x32p[TX_SIZE_MAX_SB];
-  int txfm_count_16x16p[TX_SIZE_MAX_MB];
-  int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
+
   int64_t rd_tx_select_diff[NB_TXFM_MODES];
   int rd_tx_select_threshes[4][NB_TXFM_MODES];
 
@@ -396,7 +350,6 @@
   double gf_rate_correction_factor;
 
   int frames_till_gf_update_due;      // Count down till next GF
-  int current_gf_interval;          // GF interval chosen when we coded the last GF
 
   int gf_overspend_bits;            // Total bits overspent becasue of GF boost (cumulative)
 
@@ -453,57 +406,16 @@
 
   int cq_target_quality;
 
-  int sb32_count[2];
-  int sb64_count[2];
-  int sb_ymode_count [VP9_I32X32_MODES];
-  int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
-  int bmode_count[VP9_NKF_BINTRAMODES];
-  int i8x8_mode_count[VP9_I8X8_MODES];
-  int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
-  int mbsplit_count[VP9_NUMMBSPLITS];
-  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_count[2];
-  unsigned int interintra_select_count[2];
-#endif
+  int y_mode_count[4][VP9_INTRA_MODES];
+  int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES];
+  unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   nmv_context_counts NMVcount;
 
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
+  vp9_coeff_count coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
-
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
-
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
-
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob frame_nzc_probs_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  unsigned int frame_nzc_branch_ct_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];
-  vp9_prob frame_nzc_probs_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  unsigned int frame_nzc_branch_ct_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];
-  vp9_prob frame_nzc_probs_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  unsigned int frame_nzc_branch_ct_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];
-  vp9_prob frame_nzc_probs_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  unsigned int frame_nzc_branch_ct_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];
-#endif
-
   int gfu_boost;
   int last_boost;
   int kf_boost;
@@ -521,7 +433,6 @@
   int mbgraph_n_frames;             // number of frames filled in the above
   int static_mb_pct;                // % forced skip mbs by segmentation
   int seg0_progress, seg0_idx, seg0_cnt;
-  int ref_pred_count[3][2];
 
   int decimation_factor;
   int decimation_count;
@@ -529,7 +440,7 @@
   // for real time encoding
   int avg_encode_time;              // microsecond
   int avg_pick_mode_time;            // microsecond
-  int Speed;
+  int speed;
   unsigned int cpu_freq;           // Mhz
   int compressor_speed;
 
@@ -542,12 +453,8 @@
   vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
   int last_skip_probs_q[3];
 
-  int recent_ref_frame_usage[MAX_REF_FRAMES];
-  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
   int ref_frame_flags;
 
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-
   SPEED_FEATURES sf;
   int error_bins[1024];
 
@@ -555,8 +462,6 @@
   int inter_zz_count;
   int gf_bad_count;
   int gf_update_recommended;
-  int skip_true_count[3];
-  int skip_false_count[3];
 
   unsigned char *segmentation_map;
 
@@ -566,8 +471,6 @@
   unsigned char *active_map;
   unsigned int active_map_enabled;
 
-  TOKENLIST *tplist;
-
   fractional_mv_step_fp *find_fractional_mv_step;
   vp9_full_search_fn_t full_search_sad;
   vp9_refining_search_fn_t refining_search_sad;
@@ -578,16 +481,14 @@
   uint64_t time_pick_lpf;
   uint64_t time_encode_mb_row;
 
-  int base_skip_false_prob[QINDEX_RANGE][3];
-
   struct twopass_rc {
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
@@ -640,6 +541,8 @@
   int    bytes;
   double summed_quality;
   double summed_weights;
+  double summedp_quality;
+  double summedp_weights;
   unsigned int tot_recode_hits;
 
 
@@ -656,19 +559,8 @@
   unsigned int activity_avg;
   unsigned int *mb_activity_map;
   int *mb_norm_activity_map;
-
-  // Record of which MBs still refer to last golden frame either
-  // directly or through 0,0
-  unsigned char *gf_active_flags;
-  int gf_active_count;
-
   int output_partition;
 
-  // Store last frame's MV info for next frame MV prediction
-  int_mv *lfmv;
-  int *lf_ref_frame_sign_bias;
-  int *lf_ref_frame;
-
   /* force next frame to intra when kf_auto says so */
   int force_next_frame_intra;
 
@@ -680,13 +572,36 @@
                                       [VP9_SWITCHABLE_FILTERS];
   unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
 
-#if CONFIG_NEW_MVREF
-  unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-#endif
-
   int initial_width;
   int initial_height;
+
+#if CONFIG_MULTIPLE_ARF
+  // ARF tracking variables.
+  int multi_arf_enabled;
+  unsigned int frame_coding_order_period;
+  unsigned int new_frame_coding_order_period;
+  int frame_coding_order[MAX_LAG_BUFFERS * 2];
+  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
+  int arf_weight[MAX_LAG_BUFFERS];
+  int arf_buffered;
+  int this_frame_weight;
+  int max_arf_level;
+#endif
+
+#ifdef ENTROPY_STATS
+  int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+#endif
 } VP9_COMP;
+
+static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  if (ref_frame == LAST_FRAME) {
+    return cpi->lst_fb_idx;
+  } else if (ref_frame == GOLDEN_FRAME) {
+    return cpi->gld_fb_idx;
+  } else {
+    return cpi->alt_fb_idx;
+  }
+}
 
 void vp9_encode_frame(VP9_COMP *cpi);
 
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -120,112 +120,7 @@
   return max_filter_level;
 }
 
-void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
 
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-  int filt_val;
-  int best_filt_val = cm->filter_level;
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vp9_yv12_copy_partial_frame(cm->frame_to_show, &cpi->last_frame_uf, 3);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->sharpness_level != cm->last_sharpness_level) {
-    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-  else if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-
-  filt_val = cm->filter_level;
-  best_filt_val = filt_val;
-
-  // Get the err using the previous frame's filter value.
-  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-  //  Re-instate the unfiltered frame
-  vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-
-  // Search lower filter levels
-  while (filt_val >= min_filter_level) {
-    // Apply the loop filter
-    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-    // Get the err for filtered frame
-    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-    //  Re-instate the unfiltered frame
-    vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-
-    // Update the best case record or exit loop.
-    if (filt_err < best_err) {
-      best_err = filt_err;
-      best_filt_val = filt_val;
-    } else
-      break;
-
-    // Adjust filter level
-    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-  }
-
-  // Search up (note that we have already done filt_val = cm->filter_level)
-  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
-
-  if (best_filt_val == cm->filter_level) {
-    // Resist raising filter level for very small gains
-    best_err -= (best_err >> 10);
-
-    while (filt_val < max_filter_level) {
-      // Apply the loop filter
-      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-      // Get the err for filtered frame
-      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-      //  Re-instate the unfiltered frame
-      vp9_yv12_copy_partial_frame(&cpi->last_frame_uf,
-                                      cm->frame_to_show, 3);
-
-      // Update the best case record or exit loop.
-      if (filt_err < best_err) {
-        // Do not raise filter level if improvement is < 1 part in 4096
-        best_err = filt_err - (filt_err >> 10);
-
-        best_filt_val = filt_val;
-      } else
-        break;
-
-      // Adjust filter level
-      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
-    }
-  }
-
-  cm->filter_level = best_filt_val;
-
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-
-  if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-}
-
 // Stub function for now Alt LF not used
 void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
 }
@@ -268,7 +163,7 @@
 
   // Get baseline error score
   vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, 0);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);
 
   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
   filt_best = filt_mid;
@@ -293,7 +188,7 @@
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
       vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, 0);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -313,7 +208,7 @@
     // Now look at filt_high
     if ((filt_direction >= 0) && (filt_high != filt_mid)) {
       vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, 0);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -338,30 +233,4 @@
   }
 
   cm->filter_level = filt_best;
-
-#if CONFIG_LOOP_DERING
-  /* Decide whether to turn on deringing filter */
-  {  // NOLINT
-    int best_dering = 0;
-    int this_dering;
-    int last_err_diff = INT_MAX;
-
-    for (this_dering = 1; this_dering <= 16; this_dering++) {
-      vp9_set_alt_lf_level(cpi, filt_best);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, this_dering);
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-      vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-      if (filt_err < best_err) {
-        best_err = filt_err;
-        best_dering = this_dering;
-        last_err_diff = INT_MAX;
-      } else {
-        if (filt_err - best_err > last_err_diff)
-          break;
-        last_err_diff = filt_err - best_err;
-      }
-    }
-    cm->dering_enabled = best_dering;
-  }
-#endif
 }
--- a/vp9/encoder/vp9_picklpf.h
+++ b/vp9/encoder/vp9_picklpf.h
@@ -15,9 +15,6 @@
 struct yv12_buffer_config;
 struct VP9_COMP;
 
-void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,
-                                struct VP9_COMP *cpi);
-
 void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
 
 void vp9_pick_filter_level(struct yv12_buffer_config *sd,
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,340 +21,12 @@
 extern int enc_debug;
 #endif
 
-static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) {
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-  if (b_idx < (16 << (sb_type * 2)))
-    return 0;  // Y
-  else if (b_idx < (20 << (sb_type * 2)))
-    return 16;  // U
-  assert(b_idx < (24 << (sb_type * 2)));
-  return 20;  // V
+static INLINE int plane_idx(int plane) {
+  return plane == 0 ? 0 :
+         plane == 1 ? 16 : 20;
 }
 
-void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[0];
-  BLOCKD *const d = &xd->block[0];
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-  const int *pt_scan;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
-
-  assert(plane_idx(xd, b_idx) == 0);
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_4x4;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_4x4;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_4x4;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  if (!b->skip_block) {
-    for (i = 0; i < 16; i++) {
-      rc   = pt_scan[i];
-      z    = coeff_ptr[rc];
-
-      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-      zbin_boost_ptr++;
-
-      sz = (z >> 31);                                 // sign of z
-      x  = (z ^ sz) - sz;                             // x = abs(z)
-
-      if (x >= zbin) {
-        x += round_ptr[rc];
-        y  = (((x * quant_ptr[rc]) >> 16) + x)
-             >> quant_shift_ptr[rc];                // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-        if (y) {
-          eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-        }
-      }
-    }
-  }
-
-  xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  if (!b->skip_block) {
-    for (i = 0; i < 16; i++) {
-      rc   = vp9_default_zig_zag1d_4x4[i];
-      z    = coeff_ptr[rc];
-
-      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-      zbin_boost_ptr++;
-
-      sz = (z >> 31);                                 // sign of z
-      x  = (z ^ sz) - sz;                             // x = abs(z)
-
-      if (x >= zbin) {
-        x += round_ptr[rc];
-
-        y  = (((x * quant_ptr[rc]) >> 16) + x)
-             >> quant_shift_ptr[rc];                // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-        if (y) {
-          eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-        }
-      }
-    }
-  }
-
-  xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
-}
-
-void vp9_quantize_mby_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, i, tx_type);
-    } else {
-      x->quantize_b_4x4(x, i);
-    }
-  }
-}
-
-void vp9_quantize_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-void vp9_quantize_mb_4x4(MACROBLOCK *x) {
-  vp9_quantize_mby_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-}
-
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_8x8;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_8x8;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_8x8;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
-
-  if (!b->skip_block) {
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    int zero_run;
-    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-    int16_t *coeff_ptr  = mb->coeff + 16 * b_idx;
-    int16_t *zbin_ptr   = b->zbin;
-    int16_t *round_ptr  = b->round;
-    int16_t *quant_ptr  = b->quant;
-    uint8_t *quant_shift_ptr = b->quant_shift;
-    int16_t *dequant_ptr = d->dequant;
-    int zbin_oq_value = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
-#endif
-
-    eob = -1;
-
-    // Special case for DC as it is the one triggering access in various
-    // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0]
-    {
-      z    = coeff_ptr[0];
-      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);
-      zero_run = 1;
-
-      sz = (z >> 31);                                // sign of z
-      x  = (z ^ sz) - sz;                            // x = abs(z)
-
-      if (x >= zbin) {
-        x += (round_ptr[0]);
-        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))
-             >> quant_shift_ptr[0];                  // quantize (x)
-        x  = (y ^ sz) - sz;                          // get the sign back
-        qcoeff_ptr[0]  = x;                          // write to destination
-        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value
-
-        if (y) {
-          eob = 0;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zero_run = 0;
-        }
-      }
-    }
-    for (i = 1; i < 64; i++) {
-      rc   = pt_scan[i];
-      z    = coeff_ptr[rc];
-      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);
-      // The original code was incrementing zero_run while keeping it at
-      // maximum 15 by adding "(zero_run < 15)". The same is achieved by
-      // removing the opposite of the sign mask of "(zero_run - 15)".
-      zero_run -= (zero_run - 15) >> 31;
-
-      sz = (z >> 31);                                // sign of z
-      x  = (z ^ sz) - sz;                            // x = abs(z)
-
-      if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
-        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))
-             >> quant_shift_ptr[1];                  // quantize (x)
-        x  = (y ^ sz) - sz;                          // get the sign back
-        qcoeff_ptr[rc]  = x;                         // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value
-
-        if (y) {
-          eob = i;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                     // number of nonzero coeffs
-#endif
-          zero_run = 0;
-        }
-      }
-    }
-    xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = nzc;
-#endif
-  } else {
-    xd->eobs[b_idx] = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = 0;
-#endif
-  }
-}
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x) {
-  int i;
-
-#if CONFIG_CODE_NONZEROCOUNT
-  for (i = 0; i < 16; i ++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  for (i = 0; i < 16; i += 4) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, (i & 8) + ((i & 4) >> 1));
-    x->quantize_b_8x8(x, i, tx_type);
-  }
-}
-
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-#if CONFIG_CODE_NONZEROCOUNT
-  for (i = 16; i < 24; i ++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  for (i = 16; i < 24; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_mb_8x8(MACROBLOCK *x) {
-  vp9_quantize_mby_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_quantize_mby_16x16(MACROBLOCK *x) {
-  TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, 0);
-#if CONFIG_CODE_NONZEROCOUNT
-  int i;
-  for (i = 0; i < 16; i++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  x->quantize_b_16x16(x, 0, tx_type);
-}
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x) {
-  vp9_quantize_mby_16x16(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-static void quantize(int16_t *zbin_boost_orig_ptr,
+static void quantize(int16_t *zbin_boost_orig_ptr,
                      int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
@@ -361,9 +33,6 @@
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
                      uint16_t *eob_ptr,
-#if CONFIG_CODE_NONZEROCOUNT
-                     uint16_t *nzc_ptr,
-#endif
                      const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
@@ -370,9 +39,6 @@
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -401,9 +67,6 @@
         if (y) {
           eob = i;                                  // last nonzero coeffs
           zero_run = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                    // number of nonzero coeffs
-#endif
         }
       }
     }
@@ -410,200 +73,66 @@
   }
 
   *eob_ptr = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  *nzc_ptr = nzc;
-#endif
 }
 
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
+  const int mul = n_coeffs == 1024 ? 2 : 1;
+  const int *scan;
 
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_16x16;
+  // These contexts may be available in the caller
+  switch (n_coeffs) {
+    case 4 * 4:
+      scan = get_scan_4x4(tx_type);
       break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_16x16;
+    case 8 * 8:
+      scan = get_scan_8x8(tx_type);
       break;
+    case 16 * 16:
+      scan = get_scan_16x16(tx_type);
+      break;
     default:
-      pt_scan = vp9_default_zig_zag1d_16x16;
+      scan = vp9_default_scan_32x32;
       break;
   }
 
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + 16 * b_idx,
-           256, b->skip_block,
-           b->zbin, b->round, b->quant, b->quant_shift,
-           xd->qcoeff + 16 * b_idx,
-           xd->dqcoeff + 16 * b_idx,
-           d->dequant,
-           b->zbin_extra,
-           &xd->eobs[b_idx],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
-           pt_scan, 1);
+  quantize(mb->plane[plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+           n_coeffs, mb->skip_block,
+           mb->plane[plane].zbin,
+           mb->plane[plane].round,
+           mb->plane[plane].quant,
+           mb->plane[plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+           xd->plane[plane].dequant,
+           mb->plane[plane].zbin_extra,
+           &xd->plane[plane].eobs[block],
+           scan, mul);
 }
 
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
+  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
+  const int *pt_scan = get_scan_4x4(tx_type);
 
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + b_idx * 16,
-           1024, b->skip_block,
-           b->zbin,
-           b->round, b->quant, b->quant_shift,
-           xd->qcoeff + b_idx * 16,
-           xd->dqcoeff + b_idx * 16,
-           d->dequant,
-           b->zbin_extra,
-           &xd->eobs[b_idx],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
-           vp9_default_zig_zag1d_32x32, 2);
+  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+           16, mb->skip_block,
+           mb->plane[pb_idx.plane].zbin,
+           mb->plane[pb_idx.plane].round,
+           mb->plane[pb_idx.plane].quant,
+           mb->plane[pb_idx.plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
+           xd->plane[pb_idx.plane].dequant,
+           mb->plane[pb_idx.plane].zbin_extra,
+           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
+           pt_scan, 1);
 }
 
-void vp9_quantize_sby_32x32(MACROBLOCK *x) {
-  vp9_regular_quantize_b_32x32(x, 0);
-}
-
-void vp9_quantize_sby_16x16(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        (16 * (n & 2)) + ((n & 1) * 4));
-    x->quantize_b_16x16(x, n * 16, tx_type);
-  }
-}
-
-void vp9_quantize_sby_8x8(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      (4 * (n & 12)) + ((n & 3) * 2));
-    x->quantize_b_8x8(x, n * 4, tx_type);
-  }
-}
-
-void vp9_quantize_sby_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, n, tx_type);
-    } else {
-      x->quantize_b_4x4(x, n);
-    }
-  }
-}
-
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
-  x->quantize_b_16x16(x, 64, DCT_DCT);
-  x->quantize_b_16x16(x, 80, DCT_DCT);
-}
-
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 64; i < 96; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 64; i < 96; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++)
-    vp9_regular_quantize_b_32x32(x, n * 64);
-}
-
-void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        (16 * (n & 12)) + ((n & 3) * 4));
-    x->quantize_b_16x16(x, n * 16, tx_type);
-  }
-}
-
-void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      (4 * (n & 56)) + ((n & 7) * 2));
-    x->quantize_b_8x8(x, n * 4, tx_type);
-  }
-}
-
-void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, n, tx_type);
-    } else {
-      x->quantize_b_4x4(x, n);
-    }
-  }
-}
-
-void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {
-  vp9_regular_quantize_b_32x32(x, 256);
-  vp9_regular_quantize_b_32x32(x, 320);
-}
-
-void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i += 16)
-    x->quantize_b_16x16(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) {
-  vp9_regular_quantize_b_4x4(x, b_idx1);
-  vp9_regular_quantize_b_4x4(x, b_idx2);
-}
-
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
@@ -618,6 +147,10 @@
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
+  int quant_uv_val;
+#if CONFIG_ALPHA
+  int quant_alpha_val;
+#endif
   int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
@@ -631,39 +164,63 @@
       qrounding_factor = 64;
     }
     // dc values
-    quant_val = vp9_dc_quant(q, cpi->common.y1dc_delta_q);
-    invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val);
-    cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y1dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7;
+    quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
+    invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
+    cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.y_dequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
-    quant_val = vp9_dc_uv_quant(q, cpi->common.uvdc_delta_q);
-    invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val);
-    cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.UVdequant[q][0] = quant_val;
+    quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
+    invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
+    cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
+#if CONFIG_ALPHA
+    quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
+    invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val);
+    cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.a_dequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
+#endif
+
+    quant_val = vp9_ac_quant(q, 0);
+    cpi->common.y_dequant[q][1] = quant_val;
+    quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+    cpi->common.uv_dequant[q][1] = quant_uv_val;
+#if CONFIG_ALPHA
+    quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q);
+    cpi->common.a_dequant[q][1] = quant_alpha_val;
+#endif
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d_4x4[i];
+      int rc = vp9_default_scan_4x4[i];
 
-      quant_val = vp9_ac_yquant(q);
-      invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
-      cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y1dequant[q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y1[q][i] =
+      invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
+      cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+      cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->zrun_zbin_boost_y[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_uv_quant(q, cpi->common.uvac_delta_q);
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.UVdequant[q][rc] = quant_val;
+      invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
+        quant_uv_val);
+      cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
+          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
+
+#if CONFIG_ALPHA
+      invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
+          quant_alpha_val);
+      cpi->a_zbin[q][rc] =
+          ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
+      cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
+      cpi->zrun_zbin_boost_a[q][i] =
+          ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
+#endif
     }
   }
 }
@@ -670,91 +227,63 @@
 
 void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   int i;
-  int QIndex;
   MACROBLOCKD *xd = &x->e_mbd;
   int zbin_extra;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex);
 
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    // Abs Value
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    // Delta Value
-    else {
-      QIndex = cpi->common.base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
-    }
-  } else
-    QIndex = cpi->common.base_qindex;
-
   // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
+  zbin_extra = (cpi->common.y_dequant[qindex][1] *
+                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 0; i < 16; i++) {
-    x->block[i].quant = cpi->Y1quant[QIndex];
-    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
-    x->block[i].zbin = cpi->Y1zbin[QIndex];
-    x->block[i].round = cpi->Y1round[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
+  x->plane[0].quant = cpi->y_quant[qindex];
+  x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
+  x->plane[0].zbin = cpi->y_zbin[qindex];
+  x->plane[0].round = cpi->y_round[qindex];
+  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
+  x->plane[0].zbin_extra = (int16_t)zbin_extra;
+  x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
 
-    // Segment skip feature.
-    x->block[i].skip_block =
-      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  }
-
   // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
+  zbin_extra = (cpi->common.uv_dequant[qindex][1] *
+                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 16; i < 24; i++) {
-    x->block[i].quant = cpi->UVquant[QIndex];
-    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
-    x->block[i].zbin = cpi->UVzbin[QIndex];
-    x->block[i].round = cpi->UVround[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-
-    // Segment skip feature.
-    x->block[i].skip_block =
-      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+  for (i = 1; i < 3; i++) {
+    x->plane[i].quant = cpi->uv_quant[qindex];
+    x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
+    x->plane[i].zbin = cpi->uv_zbin[qindex];
+    x->plane[i].round = cpi->uv_round[qindex];
+    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
+    x->plane[i].zbin_extra = (int16_t)zbin_extra;
+    x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
   }
 
+#if CONFIG_ALPHA
+  x->plane[3].quant = cpi->a_quant[qindex];
+  x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
+  x->plane[3].zbin = cpi->a_zbin[qindex];
+  x->plane[3].round = cpi->a_round[qindex];
+  x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
+  x->plane[3].zbin_extra = (int16_t)zbin_extra;
+  x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
+#endif
+
+  x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+
   /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = QIndex;
+  x->e_mbd.q_index = qindex;
 }
 
 void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex = x->e_mbd.q_index;
-  int zbin_extra;
+  const int qindex = x->e_mbd.q_index;
+  const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
+                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
+  const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
+                  (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-  for (i = 0; i < 16; i++) {
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-  }
+  x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
+  x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
+  x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
@@ -770,15 +299,11 @@
 
   cm->base_qindex = Q;
 
-  // Set lossless mode
-  if (cm->base_qindex <= 4)
-    cm->base_qindex = 0;
-
   // if any of the delta_q values are changing update flag will
   // have to be set.
-  cm->y1dc_delta_q = 0;
-  cm->uvdc_delta_q = 0;
-  cm->uvac_delta_q = 0;
+  cm->y_dc_delta_q = 0;
+  cm->uv_dc_delta_q = 0;
+  cm->uv_ac_delta_q = 0;
 
   // quantizer has to be reinitialized if any delta_q changes.
   // As there are not any here for now this is inactive code.
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,46 +22,15 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_quantize_x86.h"
-#endif
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
+                  TX_TYPE tx_type);
 
-void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type);
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx);
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2);
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx);
-
-void vp9_quantize_mb_4x4(MACROBLOCK *x);
-void vp9_quantize_mb_8x8(MACROBLOCK *x);
-
-void vp9_quantize_mbuv_4x4(MACROBLOCK *x);
-void vp9_quantize_mby_4x4(MACROBLOCK *x);
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x);
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x);
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x);
-void vp9_quantize_mby_16x16(MACROBLOCK *x);
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x);
-void vp9_quantize_sby_16x16(MACROBLOCK *x);
-void vp9_quantize_sby_8x8(MACROBLOCK *x);
-void vp9_quantize_sby_4x4(MACROBLOCK *x);
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x);
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x);
-
-void vp9_quantize_sb64y_32x32(MACROBLOCK *x);
-void vp9_quantize_sb64y_16x16(MACROBLOCK *x);
-void vp9_quantize_sb64y_8x8(MACROBLOCK *x);
-void vp9_quantize_sb64y_4x4(MACROBLOCK *x);
-void vp9_quantize_sb64uv_32x32(MACROBLOCK *x);
-void vp9_quantize_sb64uv_16x16(MACROBLOCK *x);
-void vp9_quantize_sb64uv_8x8(MACROBLOCK *x);
-void vp9_quantize_sb64uv_4x4(MACROBLOCK *x);
-
+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
+                                     int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks);
+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks);
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -30,16 +30,6 @@
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
-#ifdef MODE_STATS
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
@@ -89,7 +79,7 @@
 // tables if and when things settle down in the experimental bitstream
 double vp9_convert_qindex_to_q(int qindex) {
   // Convert the index to a real Q value (scaled down to match old Q values)
-  return vp9_ac_yquant(qindex) / 4.0;
+  return vp9_ac_quant(qindex, 0) / 4.0;
 }
 
 int vp9_gfboost_qadjust(int qindex) {
@@ -112,7 +102,7 @@
   const double q = vp9_convert_qindex_to_q(qindex);
   int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
 
-  // q based adjustment to baseline enumberator
+  // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
@@ -132,52 +122,31 @@
   vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
   vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
 
-  vp9_copy(cc->vp9_mode_contexts, cm->fc.vp9_mode_contexts);
+  vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs);
 
-  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-  vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
-  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+  vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+  vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(cc->y_modes,       y_modes);
-  vp9_copy(cc->uv_modes,      uv_modes);
-  vp9_copy(cc->b_modes,       b_modes);
-  vp9_copy(cc->inter_y_modes,  inter_y_modes);
-  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
-  vp9_copy(cc->inter_b_modes,  inter_b_modes);
-#endif
-
   vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
-  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
-  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
-  vp9_copy(cc->prob_comppred, cm->prob_comppred);
 
+  vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
+  vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
+  vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob);
+  vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob);
+
   vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
+             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
   vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
 
-  vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4);
-  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
-  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
-  vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
+  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cc->interintra_prob = cm->fc.interintra_prob;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);
-  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);
-  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);
-  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);
-  vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs);
-#endif
+  vp9_copy(cc->tx_probs_8x8p, cm->fc.tx_probs_8x8p);
+  vp9_copy(cc->tx_probs_16x16p, cm->fc.tx_probs_16x16p);
+  vp9_copy(cc->tx_probs_32x32p, cm->fc.tx_probs_32x32p);
+  vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -193,53 +162,32 @@
   vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
   vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
 
-  vp9_copy(cm->fc.vp9_mode_contexts, cc->vp9_mode_contexts);
+  vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs);
 
-  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-  vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
-  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+  vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
-  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+  vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(y_modes, cc->y_modes);
-  vp9_copy(uv_modes, cc->uv_modes);
-  vp9_copy(b_modes, cc->b_modes);
-  vp9_copy(inter_y_modes, cc->inter_y_modes);
-  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
-  vp9_copy(inter_b_modes, cc->inter_b_modes);
-#endif
-
   vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
-  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
-  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
-  vp9_copy(cm->prob_comppred, cc->prob_comppred);
 
+  vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
+  vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
+  vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob);
+  vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob);
+
   vpx_memcpy(cm->last_frame_seg_map,
              cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mb_rows * cm->mb_cols));
+             (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
   vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
 
-  vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4);
-  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
-  vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
+  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cm->fc.interintra_prob = cc->interintra_prob;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);
-  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);
-  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);
-  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);
-  vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs);
-#endif
+  vp9_copy(cm->fc.tx_probs_8x8p, cc->tx_probs_8x8p);
+  vp9_copy(cm->fc.tx_probs_16x16p, cc->tx_probs_16x16p);
+  vp9_copy(cm->fc.tx_probs_32x32p, cc->tx_probs_32x32p);
+  vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
@@ -258,12 +206,11 @@
 void vp9_setup_inter_frame(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  if (cm->error_resilient_mode)
+  if (cm->error_resilient_mode || cm->intra_only)
     vp9_setup_past_independence(cm, xd);
 
   assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
-  vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx],
-             sizeof(cm->fc));
+  cm->fc = cm->frame_contexts[cm->frame_context_idx];
 }
 
 static int estimate_bits_at_q(int frame_kind, int q, int mbs,
@@ -300,7 +247,7 @@
 }
 
 
-//  Do the best we can to define the parameteres for the next GF based
+//  Do the best we can to define the parameters for the next GF based
 //  on what information we have available.
 //
 //  In this experimental code only two pass is supported
@@ -358,16 +305,13 @@
           (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)
            * cpi->last_boost) / 100;
       }
-
     } else {
       // If there is an active ARF at this location use the minimum
-      // bits on this frame even if it is a contructed arf.
+      // bits on this frame even if it is a constructed arf.
       // The active maximum quantizer insures that an appropriate
-      // number of bits will be spent if needed for contstructed ARFs.
+      // number of bits will be spent if needed for constructed ARFs.
       cpi->this_frame_target = 0;
     }
-
-    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
   }
 }
 
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,33 +41,17 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
-static const int auto_speed_thresh[17] = {
-  1000,
-  200,
-  150,
-  130,
-  150,
-  125,
-  120,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  105
-};
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
+#define I4X4_PRED 0x8000
+#define SPLITMV 0x10000
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -104,118 +87,63 @@
   {SPLITMV,   GOLDEN_FRAME, NONE},
   {SPLITMV,   ALTREF_FRAME, NONE},
 
-  {B_PRED,    INTRA_FRAME,  NONE},
-  {I8X8_PRED, INTRA_FRAME,  NONE},
+  {I4X4_PRED, INTRA_FRAME,  NONE},
 
   /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
+  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
+  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
+  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
 
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
-
   {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
   {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
   {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
 
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
+  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
   {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
 
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
+  {SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  /* compound inter-intra prediction */
-  {ZEROMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEARESTMV, LAST_FRAME,   INTRA_FRAME},
-  {NEARMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEWMV,     LAST_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARESTMV, GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEWMV,     GOLDEN_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEARESTMV, ALTREF_FRAME,   INTRA_FRAME},
-  {NEARMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEWMV,     ALTREF_FRAME,   INTRA_FRAME},
-#endif
 };
 
-static void fill_token_costs(vp9_coeff_count *c,
-                             vp9_coeff_probs *p,
-                             int block_type_counts) {
-  int i, j, k, l;
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for blocks size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
+  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-  for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < REF_TYPES; j++)
-      for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
-                               vp9_coef_tree);
-        }
-}
+#define BASE_RD_THRESH_FREQ_FACT 16
+#define MAX_RD_THRESH_FREQ_FACT 32
+#define MAX_RD_THRESH_FREQ_INC 1
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {
-  int nzc_context, r, b, nzc, values;
-  int cost[16];
-  values = block_size * block_size + 1;
-
-  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        unsigned int *nzc_costs;
-        if (block_size == 4) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
-                          vp9_nzc4x4_tree);
-          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];
-        } else if (block_size == 8) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
-                          vp9_nzc8x8_tree);
-          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];
-        } else if (block_size == 16) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
-                          vp9_nzc16x16_tree);
-          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];
-        } else {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
-                          vp9_nzc32x32_tree);
-          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];
-        }
-
-        for (nzc = 0; nzc < values; ++nzc) {
-          int e, c, totalcost = 0;
-          c = codenzc(nzc);
-          totalcost = cost[c];
-          if ((e = vp9_extranzcbits[c])) {
-            int x = nzc - vp9_basenzcvalue[c];
-            while (e--) {
-              totalcost += vp9_cost_bit(
-                  cpi->common.fc.nzc_pcat_probs[nzc_context]
-                                               [c - NZC_TOKENS_NOEXTRA][e],
-                  ((x >> e) & 1));
-            }
+static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
+                             vp9_coeff_count (*cnoskip)[BLOCK_TYPES],
+                             vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
+  int i, j, k, l;
+  TX_SIZE t;
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    for (i = 0; i < BLOCK_TYPES; i++)
+      for (j = 0; j < REF_TYPES; j++)
+        for (k = 0; k < COEF_BANDS; k++)
+          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+            vp9_prob probs[ENTROPY_NODES];
+            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
+            vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs,
+                            vp9_coef_tree);
+#if CONFIG_BALANCED_COEFTREE
+            // Replace the eob node prob with a very small value so that the
+            // cost approximately equals the cost without the eob node
+            probs[1] = 1;
+            vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree);
+#else
+            vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs,
+                                 vp9_coef_tree);
+            assert(c[t][i][j][k][l][DCT_EOB_TOKEN] ==
+                   cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]);
+#endif
           }
-          nzc_costs[nzc] = totalcost;
-        }
-      }
-    }
-  }
 }
-#endif
 
-
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -236,12 +164,12 @@
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
 static int compute_rd_mult(int qindex) {
-  int q = vp9_dc_quant(qindex, 0);
+  const int q = vp9_dc_quant(qindex, 0);
   return (11 * q * q) >> 2;
 }
 
@@ -252,7 +180,7 @@
 
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
-  int q, i;
+  int q, i, bsize;
 
   vp9_clear_system_state();  // __asm emms;
 
@@ -260,7 +188,7 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -284,44 +212,56 @@
     cpi->RDDIV = 1;
     cpi->RDMULT /= 100;
 
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < INT_MAX) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
+    for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+      for (i = 0; i < MAX_MODES; ++i) {
+        // Threshold here seem unecessarily harsh but fine given actual
+        // range of values used for cpi->sf.thresh_mult[]
+        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
+        // *4 relates to the scaling of rd_thresh_block_size_factor[]
+        if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
+          cpi->rd_threshes[bsize][i] =
+            cpi->sf.thresh_mult[i] * q *
+            rd_thresh_block_size_factor[bsize] / (4 * 100);
+        } else {
+          cpi->rd_threshes[bsize][i] = INT_MAX;
+        }
+        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
+        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   } else {
     cpi->RDDIV = 100;
 
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
+    for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+      for (i = 0; i < MAX_MODES; i++) {
+        // Threshold here seem unecessarily harsh but fine given actual
+        // range of values used for cpi->sf.thresh_mult[]
+        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
+        if (cpi->sf.thresh_mult[i] < thresh_max) {
+          cpi->rd_threshes[bsize][i] =
+            cpi->sf.thresh_mult[i] * q *
+            rd_thresh_block_size_factor[bsize] / 4;
+        } else {
+          cpi->rd_threshes[bsize][i] = INT_MAX;
+        }
+        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
+        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   }
 
-  fill_token_costs(cpi->mb.token_costs[TX_4X4],
-                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_8X8],
-                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_16X16],
-                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_32X32],
-                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
-#if CONFIG_CODE_NONZEROCOUNT
-  fill_nzc_costs(cpi, 4);
-  fill_nzc_costs(cpi, 8);
-  fill_nzc_costs(cpi, 16);
-  fill_nzc_costs(cpi, 32);
-#endif
+  fill_token_costs(cpi->mb.token_costs,
+                   cpi->mb.token_costs_noskip,
+                   cpi->common.fc.coef_probs);
 
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
+    vp9_cost_tokens(cpi->mb.partition_cost[i],
+                    cpi->common.fc.partition_prob[cpi->common.frame_type][i],
+                    vp9_partition_tree);
+
   /*rough estimate for costing*/
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -345,389 +285,136 @@
   return error;
 }
 
-int vp9_mbblock_error_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-    berror = 0;
-    for (j = 0; j < 16; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-    error += berror;
-  }
-  return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-
-  int i, error = 0;
-
-  for (i = 16; i < 24; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
-  }
-
-  return error;
-}
-
-int vp9_uvsse(MACROBLOCK *x) {
-  uint8_t *uptr, *vptr;
-  uint8_t *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
-  uint8_t *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
-  int uv_stride = x->block[16].src_stride;
-
-  unsigned int sse1 = 0;
-  unsigned int sse2 = 0;
-  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
-  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
-  int offset;
-  int pre_stride = x->e_mbd.block[16].pre_stride;
-
-  if (mv_row < 0)
-    mv_row -= 1;
-  else
-    mv_row += 1;
-
-  if (mv_col < 0)
-    mv_col -= 1;
-  else
-    mv_col += 1;
-
-  mv_row /= 2;
-  mv_col /= 2;
-
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = x->e_mbd.pre.u_buffer + offset;
-  vptr = x->e_mbd.pre.v_buffer + offset;
-
-  if ((mv_row | mv_col) & 7) {
-    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
-    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  } else {
-    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
-    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  }
-  return sse2;
-}
-
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
-                              ENTROPY_CONTEXT *a,
-                              ENTROPY_CONTEXT *l,
-                              TX_SIZE tx_size) {
+                              int plane, int block, PLANE_TYPE type,
+                              ENTROPY_CONTEXT *A,
+                              ENTROPY_CONTEXT *L,
+                              TX_SIZE tx_size,
+                              int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt;
-  const int eob = xd->eobs[ib];
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
+                                           block, 16);
+  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
-  ENTROPY_CONTEXT a_ec, l_ec;
-  ENTROPY_CONTEXT *const a1 = a +
-      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
-  ENTROPY_CONTEXT *const l1 = l +
-      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT above_ec, left_ec;
+  TX_TYPE tx_type = DCT_DCT;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
-#else
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                        [ENTROPY_NODES];
-#endif
+  unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs_noskip[tx_size][type][ref];
+
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
+  const uint8_t * band_translate;
 
   // Check for consistency of tx_size with mode info
+  assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
-    TX_SIZE tx_size_uv = get_uv_tx_size(xd);
+    TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
     assert(tx_size == tx_size_uv);
   }
 
   switch (tx_size) {
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
-      a_ec = *a;
-      l_ec = *l;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_4x4;
-#endif
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block) : DCT_DCT;
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       seg_eob = 16;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_8x8;
-#endif
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
+      scan = get_scan_8x8(tx_type);
       seg_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_16x16;
-#endif
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       seg_eob = 256;
-      if (type == PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_32x32;
-#endif
+      scan = vp9_default_scan_32x32;
       seg_eob = 1024;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
-        a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
       abort();
       break;
   }
+  assert(eob <= seg_eob);
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
   default_eob = seg_eob;
 
-#if CONFIG_CODE_NONZEROCOUNT == 0
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
-#endif
 
+  /* sanity check to ensure that we do not have spurious non-zero q values */
+  if (eob < seg_eob)
+    assert(qcoeff_ptr[scan[eob]] == 0);
+
   {
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
-#endif
-    for (; c < eob; c++) {
+    for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc += (v != 0);
-#endif
-      token_cache[c] = t;
-      cost += token_costs[get_coef_band(scan, tx_size, c)][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-#if !CONFIG_CODE_NONZEROCOUNT
-      if (!c || token_cache[c - 1])
-        cost += vp9_cost_bit(coef_probs[type][ref]
-                                       [get_coef_band(scan, tx_size, c)]
-                                       [pt][0], 1);
-#endif
-      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+      int t = vp9_dct_value_tokens_ptr[v].token;
+      int band = get_coef_band(band_translate, c);
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+
+      if (!c || token_cache[scan[c - 1]])  // do not skip eob
+        cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      else
+        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    cost += nzc_cost[nzc];
-#else
-    if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][ref]
-                             [get_coef_band(scan, tx_size, c)]
-                             [pt][DCT_EOB_TOKEN];
-#endif
+    if (c < seg_eob) {
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+      cost += mb->token_costs_noskip[tx_size][type][ref]
+          [get_coef_band(band_translate, c)]
+          [pt][DCT_EOB_TOKEN];
+    }
   }
 
   // is eob first coefficient;
-  pt = (c > 0);
-  *a = *l = pt;
-  if (tx_size >= TX_8X8) {
-    a[1] = l[1] = pt;
-    if (tx_size >= TX_16X16) {
-      if (type == PLANE_TYPE_UV) {
-        a1[0] = a1[1] = l1[0] = l1[1] = pt;
-      } else {
-        a[2] = a[3] = l[2] = l[3] = pt;
-        if (tx_size >= TX_32X32) {
-          a1[0] = a1[1] = a1[2] = a1[3] = pt;
-          l1[0] = l1[1] = l1[2] = l1[3] = pt;
-        }
-      }
-    }
+  for (pt = 0; pt < (1 << tx_size); pt++) {
+    A[pt] = L[pt] = c > 0;
   }
-  return cost;
-}
 
-static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4);
-
   return cost;
 }
 
-static void macro_block_yrd_4x4(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_mby_4x4(mb);
-  vp9_quantize_mby_4x4(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_4x4(cm, mb);
-  *skippable = vp9_mby_is_skippable_4x4(xd);
-}
-
-static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b],
-                        TX_8X8);
-
-  return cost;
-}
-
-static void macro_block_yrd_8x8(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_mby_8x8(mb);
-  vp9_quantize_mby_8x8(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_8x8(cm, mb);
-  *skippable = vp9_mby_is_skippable_8x8(xd);
-}
-
-static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-}
-
-static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_mby_16x16(mb);
-  vp9_quantize_mby_16x16(mb);
-  // TODO(jingning) is it possible to quickly determine whether to force
-  //                trailing coefficients to be zero, instead of running trellis
-  //                optimization in the rate-distortion optimization loop?
-  if (mb->optimize &&
-      xd->mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(cm, mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_16x16(cm, mb);
-  *skippable = vp9_mby_is_skippable_16x16(xd);
-}
-
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
                                      int *d, int *distortion,
@@ -737,41 +424,34 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  vp9_prob skip_prob = cm->mb_no_coeff_skip ?
-                       vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
   int64_t rd[TX_SIZE_MAX_SB][2];
   int n, m;
+  int s0, s1;
 
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+
   for (n = TX_4X4; n <= max_txfm_size; n++) {
     r[n][1] = r[n][0];
     for (m = 0; m <= n - (n == max_txfm_size); m++) {
       if (m == n)
-        r[n][1] += vp9_cost_zero(cm->prob_tx[m]);
+        r[n][1] += vp9_cost_zero(tx_probs[m]);
       else
-        r[n][1] += vp9_cost_one(cm->prob_tx[m]);
+        r[n][1] += vp9_cost_one(tx_probs[m]);
     }
   }
 
-  if (cm->mb_no_coeff_skip) {
-    int s0, s1;
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
-    assert(skip_prob > 0);
-    s0 = vp9_cost_bit(skip_prob, 0);
-    s1 = vp9_cost_bit(skip_prob, 1);
-
-    for (n = TX_4X4; n <= max_txfm_size; n++) {
-      if (s[n]) {
-        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
-      } else {
-        rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-        rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
-      }
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    if (s[n]) {
+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+    } else {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
-  } else {
-    for (n = TX_4X4; n <= max_txfm_size; n++) {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
-    }
   }
 
   if (max_txfm_size == TX_32X32 &&
@@ -780,17 +460,19 @@
         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_32X32;
-  } else if ( cm->txfm_mode == ALLOW_16X16 ||
-             (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
-             (cm->txfm_mode == TX_MODE_SELECT &&
-              rd[TX_16X16][1] < rd[TX_8X8][1] &&
-              rd[TX_16X16][1] < rd[TX_4X4][1])) {
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              (cm->txfm_mode == TX_MODE_SELECT &&
+               rd[TX_16X16][1] < rd[TX_8X8][1] &&
+               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
+             cm->txfm_mode == ALLOW_16X16 ||
+             cm->txfm_mode == ALLOW_32X32 ||
            (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
@@ -800,13 +482,14 @@
 
   txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
   txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
-  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
-  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];
+  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];
   if (max_txfm_size == TX_32X32 &&
       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
       rd[TX_32X32][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+  else if (max_txfm_size >= TX_16X16 &&
+           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
@@ -813,41 +496,14 @@
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
-
-  vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
-                   x->block[0].src_stride);
-
-  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache, TX_16X16);
-}
-
-static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[4] = p[4];
-  d[8] = p[8];
-  d[12] = p[12];
-}
-
-static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                                int block_size, int shift) {
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
   int i;
   int64_t error = 0;
 
   for (i = 0; i < block_size; i++) {
-    unsigned int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += (unsigned)this_diff * this_diff;
   }
   error >>= shift;
 
@@ -854,383 +510,226 @@
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 64; b++)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_4X4][b],
-                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);
-
-  return cost;
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                     16 << (bwl + bhl), shift);
 }
 
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  int64_t sum = 0;
+  int plane;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x);
-  vp9_quantize_sby_4x4(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_4x4(cm, x);
-  *skippable  = vp9_sby_is_skippable_4x4(xd);
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    const int subsampling = x->e_mbd.plane[plane].subsampling_x +
+                            x->e_mbd.plane[plane].subsampling_y;
+    sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                       16 << (bwl + bhl - subsampling), 0);
+  }
+  sum >>= shift;
+  return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+struct rdcost_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  int bw;
+  int bh;
+  int cost;
+};
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  int x_idx, y_idx;
+  MACROBLOCKD * const xd = &args->x->e_mbd;
 
-  for (b = 0; b < 64; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_8X8][b],
-                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8);
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
+                           &y_idx);
 
-  return cost;
+  args->cost += cost_coeffs(args->cm, args->x, plane, block,
+                            xd->plane[plane].plane_type, args->t_above + x_idx,
+                            args->t_left + y_idx, args->tx_size,
+                            args->bw * args->bh);
 }
 
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
+                        BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 };
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x);
-  vp9_quantize_sby_8x8(x);
+  vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
+             sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
+             sizeof(ENTROPY_CONTEXT) * bh);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_8x8(cm, x);
-  *skippable  = vp9_sby_is_skippable_8x8(xd);
+  foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args);
+
+  return args.cost;
 }
 
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 64; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_16X16][b],
-                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16);
-
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+  }
   return cost;
 }
 
-static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int *distortion, int *skippable,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  xd->mode_info_context->mbmi.txfm_size = tx_size;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x);
-  vp9_quantize_sby_16x16(x);
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    vp9_encode_intra_block_y(cm, x, bsize);
+  else
+    vp9_xform_quant_sby(cm, x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_16x16(cm, x);
-  *skippable  = vp9_sby_is_skippable_16x16(xd);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
+  *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
-}
-
-static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x);
-  vp9_quantize_sby_32x32(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);
-  *rate       = rdcost_sby_32x32(cm, x);
-  *skippable  = vp9_sby_is_skippable_32x32(xd);
-}
-
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
-                            int *skip,
+                            int *skip, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
+  assert(bs == mbmi->sb_type);
+  if (mbmi->ref_frame[0] > INTRA_FRAME)
+    vp9_subtract_sby(x, bs);
 
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
-}
+  if (cpi->speed > 4) {
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      mbmi->txfm_size = TX_32X32;
+    } else if (bs >= BLOCK_SIZE_MB16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (bs >= BLOCK_SIZE_SB8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+    vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+                             mbmi->txfm_size);
+    return;
+  }
+  if (bs >= BLOCK_SIZE_SB32X32)
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                             bs, TX_32X32);
+  if (bs >= BLOCK_SIZE_MB16X16)
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                             bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
+                           TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
+                           TX_4X4);
 
-static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b++)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_4X4][b],
-                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);
-
-  return cost;
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                           skip, txfm_cache,
+                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
+                           - (bs < BLOCK_SIZE_MB16X16));
 }
 
-static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sb64y_4x4(x);
-  vp9_quantize_sb64y_4x4(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_4x4(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_4x4(xd);
-}
-
-static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_8X8][b],
-                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8);
-
-  return cost;
-}
-
-static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sb64y_8x8(x);
-  vp9_quantize_sb64y_8x8(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_8x8(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_8x8(xd);
-}
-
-static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_16X16][b],
-                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16);
-
-  return cost;
-}
-
-static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sb64y_16x16(x);
-  vp9_quantize_sb64y_16x16(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_16x16(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_16x16(xd);
-}
-
-static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 64)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_32X32][b],
-                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32);
-
-  return cost;
-}
-
-static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sb64y_32x32(x);
-  vp9_quantize_sb64y_32x32(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);
-  *rate       = rdcost_sb64y_32x32(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_32x32(xd);
-}
-
-static void super_block_64_yrd(VP9_COMP *cpi,
-                               MACROBLOCK *x, int *rate, int *distortion,
-                               int *skip,
-                               int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-
-  vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block64_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block64_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
-}
-
-static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[1] = p[1];
-  d[4] = p[4];
-  d[5] = p[5];
-  d[8] = p[8];
-  d[9] = p[9];
-  d[12] = p[12];
-  d[13] = p[13];
-  d[16] = p[16];
-  d[17] = p[17];
-  d[20] = p[20];
-  d[21] = p[21];
-  d[24] = p[24];
-  d[25] = p[25];
-  d[28] = p[28];
-  d[29] = p[29];
-}
-
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
-                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+                                     MB_PREDICTION_MODE *best_mode,
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  B_PREDICTION_MODE mode;
+                                     int *bestdistortion,
+                                     BLOCK_SIZE_TYPE bsize) {
+  MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
   VP9_COMMON *const cm = &cpi->common;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t *src, *dst;
+  int16_t *src_diff, *coeff;
 
-  ENTROPY_CONTEXT ta = *a, tempa = *a;
-  ENTROPY_CONTEXT tl = *l, templ = *l;
+  ENTROPY_CONTEXT ta[2], tempa[2];
+  ENTROPY_CONTEXT tl[2], templ[2];
   TX_TYPE tx_type = DCT_DCT;
   TX_TYPE best_tx_type = DCT_DCT;
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16.  Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 4x4 block
-   * */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 4);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy, block;
+  DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
 
-#if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
-#endif
+  assert(ib < 4);
+
+  vpx_memcpy(ta, a, sizeof(ta));
+  vpx_memcpy(tl, l, sizeof(tl));
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
+
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
-    int ratey;
+    int ratey = 0;
 
-#if CONFIG_NEWBINTRAMODES
-    if (xd->frame_type == KEY_FRAME) {
-      if (mode == B_CONTEXT_PRED) continue;
-    } else {
-      if (mode >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
-          mode < B_CONTEXT_PRED)
-        continue;
-    }
-#endif
-
-    b->bmi.as_mode.first = mode;
-#if CONFIG_NEWBINTRAMODES
-    rate = bmode_costs[
-        mode == B_CONTEXT_PRED ? mode - CONTEXT_PRED_REPLACEMENTS : mode];
-#else
     rate = bmode_costs[mode];
-#endif
+    distortion = 0;
 
-    vp9_intra4x4_predict(xd, b, mode, b->predictor);
-    vp9_subtract_b(be, b, 16);
+    vpx_memcpy(tempa, ta, sizeof(ta));
+    vpx_memcpy(templ, tl, sizeof(tl));
 
-    b->bmi.as_mode.first = mode;
-    tx_type = get_tx_type_4x4(xd, be - x->block);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-      vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
-    } else {
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(x, be - x->block);
-    }
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        block = ib + idy * 2 + idx;
+        xd->mode_info_context->bmi[block].as_mode.first = mode;
+        src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        x->plane[0].src.buf, src_stride);
+        src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                             x->plane[0].src_diff);
+        coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
+        dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride);
+        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
+                             dst, xd->plane[0].dst.stride);
+        vp9_subtract_block(4, 4, src_diff, 8,
+                           src, src_stride,
+                           dst, xd->plane[0].dst.stride);
 
-    tempa = ta;
-    templ = tl;
+        tx_type = get_tx_type_4x4(xd, block);
+        if (tx_type != DCT_DCT) {
+          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        } else {
+          x->fwd_txm4x4(src_diff, coeff, 16);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        }
 
-    ratey = cost_coeffs(cm, x, b - xd->block,
-                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-    rate += ratey;
-    distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+        ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
+                             tempa + idx, templ + idy, TX_4X4, 16);
+        distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                         block, 16), 16) >> 2;
 
+        if (best_tx_type != DCT_DCT)
+          vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                               dst, xd->plane[0].dst.stride, best_tx_type);
+        else
+          xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                             dst, xd->plane[0].dst.stride);
+      }
+    }
+
+    rate += ratey;
     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
     if (this_rd < best_rd) {
@@ -1240,21 +739,38 @@
       best_rd = this_rd;
       *best_mode = mode;
       best_tx_type = tx_type;
-      *a = tempa;
-      *l = templ;
-      copy_predictor(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      vpx_memcpy(a, tempa, sizeof(tempa));
+      vpx_memcpy(l, templ, sizeof(templ));
+      for (idy = 0; idy < bh; ++idy) {
+        for (idx = 0; idx < bw; ++idx) {
+          block = ib + idy * 2 + idx;
+          vpx_memcpy(best_dqcoeff[idy * 2 + idx],
+                     BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                     sizeof(best_dqcoeff[0]));
+        }
+      }
     }
   }
-  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      block = ib + idy * 2 + idx;
+      xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
+      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                      xd->plane[0].dst.buf,
+                                      xd->plane[0].dst.stride);
 
-  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+      vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
+                           dst, xd->plane[0].dst.stride);
+      // inverse transform
+      if (best_tx_type != DCT_DCT)
+        vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                             xd->plane[0].dst.stride, best_tx_type);
+      else
+        xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                           xd->plane[0].dst.stride);
+    }
+  }
 
   return best_rd;
 }
@@ -1262,60 +778,57 @@
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
                                          int *Distortion, int64_t best_rd) {
-  int i;
+  int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
-  int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy;
+  int cost = 0;
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT t_above[4], t_left[4];
   int *bmode_costs;
+  MODE_INFO *const mic = xd->mode_info_context;
 
-  vpx_memcpy(&t_above, xd->above_context,
-             sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context,
-             sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+  bmode_costs = mb->mbmode_cost;
 
-  xd->mode_info_context->mbmi.mode = B_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      const int mis = xd->mode_info_stride;
+      MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+      int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
+      int UNINITIALIZED_IS_SAFE(d);
+      i = idy * 2 + idx;
 
-  for (i = 0; i < 16; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+      if (xd->frame_type == KEY_FRAME) {
+        const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(mic, i) : DC_PRED;
 
-    if (xd->frame_type == KEY_FRAME) {
-      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(mic, i);
+        bmode_costs  = mb->y_mode_costs[A][L];
+      }
 
-      bmode_costs  = mb->bmode_costs[A][L];
-    }
-#if CONFIG_NEWBINTRAMODES
-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
-#endif
+      total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
+                                        t_above + idx, t_left + idy,
+                                        &r, &ry, &d, bsize);
+      cost += r;
+      distortion += d;
+      tot_rate_y += ry;
 
-    total_rd += rd_pick_intra4x4block(
-                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
-                  bmode_costs, ta + vp9_block2above[TX_4X4][i],
-                  tl + vp9_block2left[TX_4X4][i], &r, &ry, &d);
+      mic->bmi[i].as_mode.first = best_mode;
+      for (j = 1; j < bh; ++j)
+        mic->bmi[i + j * 2].as_mode.first = best_mode;
+      for (j = 1; j < bw; ++j)
+        mic->bmi[i + j].as_mode.first = best_mode;
 
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-
-    mic->bmi[i].as_mode.first = best_mode;
-
-#if 0  // CONFIG_NEWBINTRAMODES
-    printf("%d %d\n", mic->bmi[i].as_mode.first, mic->bmi[i].as_mode.context);
-#endif
-
-    if (total_rd >= best_rd)
-      break;
+      if (total_rd >= best_rd)
+        break;
+    }
   }
 
   if (total_rd >= best_rd)
@@ -1324,140 +837,68 @@
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
+  xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
-                                      MACROBLOCK *x,
-                                      int *rate,
-                                      int *rate_tokenonly,
-                                      int *distortion,
-                                      int *skippable,
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int *distortion, int *skippable,
+                                      BLOCK_SIZE_TYPE bsize,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MACROBLOCKD *const xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
+  TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
+  int i;
+  int *bmode_costs = x->mbmode_cost;
 
-  /* Y Search for 32x32 intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
-    super_block_yrd(cpi, x, &this_rate_tokenonly,
-                    &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
+  if (bsize < BLOCK_SIZE_SB8X8) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    return best_rd;
   }
 
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
 
-  return best_rd;
-}
-
-static int64_t rd_pick_intra_sb64y_mode(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable,
-                                        int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
-  int64_t best_rd = INT64_MAX, this_rd;
-
   /* Y Search for 32x32 intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int64_t local_txfm_cache[NB_TXFM_MODES];
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+
+    if (cpi->common.frame_type == KEY_FRAME) {
+      const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+      const MB_PREDICTION_MODE L = xd->left_available ?
+                                   left_block_mode(mic, 0) : DC_PRED;
+
+      bmode_costs = x->y_mode_costs[A][L];
+    }
     x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
 
-    super_block_64_yrd(cpi, x, &this_rate_tokenonly,
-                       &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+                    bsize, local_txfm_cache);
+
+    this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
+      best_tx         = x->e_mbd.mode_info_context->mbmi.txfm_size;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
     }
-  }
 
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
-                                          MACROBLOCK *x,
-                                          int *Rate,
-                                          int *rate_y,
-                                          int *Distortion,
-                                          int *skippable,
-                                          int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  TX_SIZE txfm_size = 0;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int rate, ratey;
-  int distortion, skip;
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-
-  int i;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  // Y Search for 16x16 intra prediction mode
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
-
-    mbmi->mode = mode;
-
-    vp9_build_intra_predictors_mby(xd);
-
-    macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
-    // FIXME add compoundmode cost
-    // FIXME add rate for mode2
-    rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected = mode;
-      txfm_size = mbmi->txfm_size;
-      best_rd = this_rd;
-      *Rate = rate;
-      *rate_y = ratey;
-      *Distortion = distortion;
-      *skippable = skip;
-    }
-
     for (i = 0; i < NB_TXFM_MODES; i++) {
       int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                        local_txfm_cache[cpi->common.txfm_mode];
+                       local_txfm_cache[cpi->common.txfm_mode];
       if (adj_rd < txfm_cache[i]) {
         txfm_cache[i] = adj_rd;
       }
@@ -1464,760 +905,55 @@
     }
   }
 
-  mbmi->txfm_size = txfm_size;
-  mbmi->mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx;
 
   return best_rd;
 }
 
-
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion = 0, rate = 0;
-  BLOCK  *be = x->block + ib;
-  BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT_PLANES ta, tl;
-  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
-
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16.  Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 8x8 block
-   * */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 8);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16 * 4);
-
-  // perform transformation of dimension 8x8
-  // note the input and output index mapping
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t this_rd;
-    int rate_t = 0;
-
-    // FIXME rate for compound mode and second intrapred mode
-    rate = mode_costs[mode];
-    b->bmi.as_mode.first = mode;
-
-    vp9_intra8x8_predict(xd, b, mode, b->predictor);
-
-    vp9_subtract_4b_c(be, b, 16);
-
-    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
-      else
-        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x, idx, tx_type);
-
-      // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                     (xd->block + idx)->dqcoeff, 64);
-
-      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
-      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
-      ta1 = ta0 + 1;
-      tl1 = tl0 + 1;
-
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                           ta0, tl0, TX_8X8);
-
-      rate += rate_t;
-    } else {
-      static const int iblock[4] = {0, 1, 4, 5};
-      TX_TYPE tx_type;
-      int i;
-      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
-      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
-      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
-      ta1 = ta0 + 1;
-      tl1 = tl0 + 1;
-      distortion = 0;
-      rate_t = 0;
-      for (i = 0; i < 4; ++i) {
-        int do_two = 0;
-        b = &xd->block[ib + iblock[i]];
-        be = &x->block[ib + iblock[i]];
-        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-        if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-          vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        } else if (!(i & 1) &&
-                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
-          do_two = 1;
-        } else {
-          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4(x, ib + iblock[i]);
-        }
-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
-                              TX_4X4);
-        if (do_two) {
-          i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
-                                TX_4X4);
-        }
-      }
-      b = &xd->block[ib];
-      be = &x->block[ib];
-      rate += rate_t;
-    }
-
-    distortion >>= 2;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = rate_t;
-      *bestdistortion = distortion;
-      besta0 = *ta0;
-      besta1 = *ta1;
-      bestl0 = *tl0;
-      bestl1 = *tl1;
-      best_rd = this_rd;
-      *best_mode = mode;
-      copy_predictor_8x8(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-      vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-    }
-  }
-  b->bmi.as_mode.first = (*best_mode);
-  vp9_encode_intra8x8(x, ib);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    a[vp9_block2above[TX_8X8][idx]]     = besta0;
-    a[vp9_block2above[TX_8X8][idx] + 1] = besta1;
-    l[vp9_block2left[TX_8X8][idx]]      = bestl0;
-    l[vp9_block2left[TX_8X8][idx] + 1]  = bestl1;
-  } else {
-    a[vp9_block2above[TX_4X4][ib]]     = besta0;
-    a[vp9_block2above[TX_4X4][ib + 1]] = besta1;
-    l[vp9_block2left[TX_4X4][ib]]      = bestl0;
-    l[vp9_block2left[TX_4X4][ib + 4]]  = bestl1;
-  }
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *i8x8mode_costs;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(
-                  cpi, mb, ib, &best_mode,
-                  i8x8mode_costs, ta, tl, &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-  }
-
-  *Rate = cost;
-  *rate_y = tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
-                                                  int *rate, int *rate_y,
-                                                  int *distortion,
-                                                  int *mode8x8,
-                                                  int64_t best_yrd,
-                                                  int64_t *txfm_cache) {
-  VP9_COMMON *const cm = &cpi->common;
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                      int *rate, int *distortion,
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    vp9_encode_intra_block_uv(cm, x, bsize);
+  else
+    vp9_xform_quant_sbuv(cm, x, bsize);
 
-  mbmi->txfm_size = TX_4X4;
-  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                         &d4x4, best_yrd);
-  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  mbmi->txfm_size = TX_8X8;
-  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                         &d8x8, best_yrd);
-  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
-                               tmp_rd_4x4s : tmp_rd_8x8s;
-  if (cm->txfm_mode == TX_MODE_SELECT) {
-    if (tmp_rd_4x4s < tmp_rd_8x8s) {
-      *rate = r4x4 + cost0;
-      *rate_y = tok4x4 + cost0;
-      *distortion = d4x4;
-      mbmi->txfm_size = TX_4X4;
-      tmp_rd = tmp_rd_4x4s;
-    } else {
-      *rate = r8x8 + cost1;
-      *rate_y = tok8x8 + cost1;
-      *distortion = d8x8;
-      mbmi->txfm_size = TX_8X8;
-      tmp_rd = tmp_rd_8x8s;
-
-      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-    }
-  } else if (cm->txfm_mode == ONLY_4X4) {
-    *rate = r4x4;
-    *rate_y = tok4x4;
-    *distortion = d4x4;
-    mbmi->txfm_size = TX_4X4;
-    tmp_rd = tmp_rd_4x4;
-  } else {
-    *rate = r8x8;
-    *rate_y = tok8x8;
-    *distortion = d8x8;
-    mbmi->txfm_size = TX_8X8;
-    tmp_rd = tmp_rd_8x8;
-
-    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  }
-
-  return tmp_rd;
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
 }
 
-static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4);
-
-  return cost;
-}
-
-
-static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_8X8);
-
-  return cost;
-}
-
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
-  int b;
-  int cost = 0;
+static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+                             int *rate, int *distortion, int *skippable,
+                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_16X16);
-
-  return cost;
-}
-
-static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   int backup) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_16x16(x);
-  vp9_quantize_sbuv_16x16(x);
-
-  *rate       = rd_cost_sbuv_16x16(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 1024,
-                                     xd->dqcoeff + 1024, 512, 2);
-  *skip       = vp9_sbuv_is_skippable_16x16(xd);
-}
-
-static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
-  } else {
-    int n, r = 0, d = 0;
-    int skippable = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  if (mbmi->ref_frame[0] > INTRA_FRAME)
+    vp9_subtract_sbuv(x, bsize);
 
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left, xd->left_context, sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int d_tmp, s_tmp, r_tmp;
-
-      xd->above_context = ta + x_idx;
-      xd->left_context = tl + y_idx;
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-
-      if (mbmi->txfm_size == TX_4X4) {
-        rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
-      } else {
-        rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
-      }
-
-      r += r_tmp;
-      d += d_tmp;
-      skippable = skippable && s_tmp;
-    }
-
-    *rate = r;
-    *distortion = d;
-    *skip = skippable;
-    xd->left_context = tl;
-    xd->above_context = ta;
-    memcpy(xd->above_context, t_above, sizeof(t_above));
-    memcpy(xd->left_context, t_left, sizeof(t_left));
-  }
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
-                                int *distortion, int *skip);
-static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
-                                    MACROBLOCK *x,
-                                    int *rate,
-                                    int *rate_tokenonly,
-                                    int *distortion,
-                                    int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-
-    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
-    rate = rate_to
-           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_4x4(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-
-  mbmi->uv_mode = mode_selected;
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_8x8(x);
-
-    vp9_quantize_mbuv_8x8(x);
-
-    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
-    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_8x8(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-  mbmi->uv_mode = mode_selected;
-}
-
-// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(VP9_COMMON *const cm,
-                             MACROBLOCK *x,
-                             int *rate,
-                             int *distortion,
-                             int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
+  if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_32X32);
+  } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_16X16);
+  } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_8X8);
   } else {
-    int d = 0, r = 0, n, s = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      d += vp9_mbuverror(x) >> 2;
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
-
-    xd->above_context = ta_orig;
-    xd->left_context = tl_orig;
-
-    *distortion = d;
-    *rate       = r;
-    *skippable  = s;
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_4X4);
   }
 }
 
-static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_32X32);
-
-  return cost;
-}
-
-static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   int backup) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sb64uv_32x32(x);
-  vp9_quantize_sb64uv_32x32(x);
-
-  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 4096,
-                                     xd->dqcoeff + 4096, 2048, 0);
-  *skip       = vp9_sb64uv_is_skippable_32x32(xd);
-}
-
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-  int d = 0, r = 0, n, s = 1;
-
-  // FIXME not needed if tx=32x32
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                            udst, vdst, dst_uv_stride);
-    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);
-  } else if (mbmi->txfm_size == TX_16X16) {
-    int n;
-
-    *rate = 0;
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int r_tmp, d_tmp, s_tmp;
-
-      vp9_subtract_sbuv_s_c(x->src_diff,
-                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            dst_uv_stride);
-      xd->above_context = t_above + x_idx * 2;
-      xd->left_context = t_left + y_idx * 2;
-      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);
-      r += r_tmp;
-      d += d_tmp;
-      s = s && s_tmp;
-    }
-  } else {
-    for (n = 0; n < 16; n++) {
-      int x_idx = n & 3, y_idx = n >> 2;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      d += vp9_mbuverror(x) >> 2;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
-  }
-
-  *distortion = d;
-  *rate       = r;
-  *skippable  = s;
-
-  xd->left_context = tl_orig;
-  xd->above_context = ta_orig;
-}
-
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       int *rate,
-                                       int *rate_tokenonly,
-                                       int *distortion,
-                                       int *skippable) {
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int *distortion, int *skippable,
+                                       BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int64_t best_rd = INT64_MAX, this_rd;
@@ -2226,10 +962,8 @@
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s);
+                     &this_distortion, &s, bsize);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -2249,43 +983,6 @@
   return best_rd;
 }
 
-static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
-                                         MACROBLOCK *x,
-                                         int *rate,
-                                         int *rate_tokenonly,
-                                         int *distortion,
-                                         int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
-
-    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                        &this_distortion, &s);
-    this_rate = this_rate_tokenonly +
-    x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
-  return best_rd;
-}
-
 int vp9_cost_mv_ref(VP9_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int mode_context) {
@@ -2296,11 +993,11 @@
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     VP9_COMMON *pc = &cpi->common;
 
-    vp9_prob p [VP9_MVREFS - 1];
-    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+    vp9_prob p[VP9_INTER_MODES - 1];
+    assert(NEARESTMV <= m  &&  m <= NEWMV);
     vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+    return cost_token(vp9_sb_mv_ref_tree, p,
+                      vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;
 }
@@ -2310,112 +1007,81 @@
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  int *mvjcost, int *mvcost[2]) {
+static int labels2mode(MACROBLOCK *x, int i,
+                       MB_PREDICTION_MODE this_mode,
+                       int_mv *this_mv, int_mv *this_second_mv,
+                       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                       int_mv seg_mvs[MAX_REF_FRAMES],
+                       int_mv *best_ref_mv,
+                       int_mv *second_best_ref_mv,
+                       int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
+  int cost = 0, thismvcost = 0;
+  int idx, idy;
+  int bw = 1 << b_width_log2(mbmi->sb_type);
+  int bh = 1 << b_height_log2(mbmi->sb_type);
 
-  int i, cost = 0, thismvcost = 0;
-
   /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    BLOCKD *const d = xd->block + i;
-    const int row = i >> 2,  col = i & 3;
+   Ones from this macroblock have to be pulled from the BLOCKD array
+   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+  MB_PREDICTION_MODE m;
 
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col  &&  labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row  &&  labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int :
-                                  left_block_mv(xd, mic, i);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int :
-                                           left_block_second_mv(xd, mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int :
-                                  above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int :
-                                           above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+  // the only time we should do costing for new motion vector or mode
+  // is when we are on a new label  (jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->ref_frame[1] > 0) {
+        this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
 
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int :
-                         left_block_mv(xd, mic, i);
-        if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int :
-                                  left_block_second_mv(xd, mic, i);
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->ref_frame[1] > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-        if (left_mv.as_int == this_mv->as_int &&
-            (mbmi->second_ref_frame <= 0 ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-#if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[
-          m == B_CONTEXT_PRED ? m - CONTEXT_PRED_REPLACEMENTS : m];
-#else
-      cost = x->inter_bmode_costs[m];
-#endif
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->ref_frame[1] > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
-
-    d->bmi.as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      d->bmi.as_mv[1].as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
   }
 
   cost += thismvcost;
@@ -2424,203 +1090,102 @@
 
 static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
+                                       int i,
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i;
+  int k;
   MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t* const src =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src.buf, src_stride);
+  int16_t* src_diff =
+  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src_diff);
+  int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+  uint8_t* const pre =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].pre[0].buf,
+                            xd->plane[0].pre[0].stride);
+  uint8_t* const dst =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride);
+  int thisdistortion = 0;
+  int thisrate = 0;
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-      BLOCK *be = &x->block[i];
-      int thisdistortion;
 
-      vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,
-                                bd->pre_stride,
-                                bd->predictor, 16,
-                                &bd->bmi.as_mv[0],
-                                &xd->scale_factor[0],
-                                4, 4, 0 /* no avg */, &xd->subpix);
+  vp9_build_inter_predictor(pre,
+                            xd->plane[0].pre[0].stride,
+                            dst,
+                            xd->plane[0].dst.stride,
+                            &xd->mode_info_context->bmi[i].as_mv[0],
+                            &xd->scale_factor[0],
+                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
 
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_inter_predictor(
-            *(bd->base_second_pre) + bd->pre, bd->pre_stride, bd->predictor, 16,
-            &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4,
-            1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT) /* avg */,
-            &xd->subpix);
-      }
-
-      vp9_subtract_b(be, bd, 16);
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(x, i);
-      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + vp9_block2above[TX_4X4][i],
-                                 tl + vp9_block2left[TX_4X4][i], TX_4X4);
-    }
+  // TODO(debargha): Make this work properly with the
+  // implicit-compoundinter-weight experiment when implicit
+  // weighting for splitmv modes is turned on.
+  if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
+    uint8_t* const second_pre =
+    raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                              xd->plane[0].pre[1].buf,
+                              xd->plane[0].pre[1].stride);
+    vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                              dst, xd->plane[0].dst.stride,
+                              &xd->mode_info_context->bmi[i].as_mv[1],
+                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->subpix);
   }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
 
-static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
-                                           MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT_PLANES tac, tlc;
-  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
-                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
+  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+                     src, src_stride,
+                     dst, xd->plane[0].dst.stride);
 
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      const int use_second_ref =
-          xd->mode_info_context->mbmi.second_ref_frame > 0;
-      int which_mv;
-      int idx = (ib & 8) + ((ib & 2) << 1);
-      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
-      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
-      int thisdistortion;
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;
-
-        // TODO(debargha): Make this work properly with the
-        // implicit-compoundinter-weight experiment when implicit
-        // weighting for splitmv modes is turned on.
-        vp9_build_inter_predictor(
-            *base_pre + bd->pre, bd->pre_stride, bd->predictor, 16,
-            &bd->bmi.as_mv[which_mv], &xd->scale_factor[which_mv], 8, 8,
-            which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
-            &xd->subpix);
-      }
-
-      vp9_subtract_4b_c(be, bd, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
-          x->quantize_b_8x8(x, idx, DCT_DCT);
-          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-          otherdist += thisdistortion;
-          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   tacp + vp9_block2above[TX_8X8][idx],
-                                   tlcp + vp9_block2left[TX_8X8][idx],
-                                   TX_8X8);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-        }
-        for (j = 0; j < 4; j += 2) {
-          bd = &xd->block[ib + iblock[j]];
-          be = &x->block[ib + iblock[j]];
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
-          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-          *distortion += thisdistortion;
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                          ta + vp9_block2above[TX_4X4][ib + iblock[j]],
-                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                          TX_4X4);
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
-                          PLANE_TYPE_Y_WITH_DC,
-                          ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                          TX_4X4);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            BLOCKD *bd = &xd->block[ib + iblock[j]];
-            BLOCK *be = &x->block[ib + iblock[j]];
-            x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);
-            thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-            otherdist += thisdistortion;
-            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
-                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                            TX_4X4);
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
-                            PLANE_TYPE_Y_WITH_DC,
-                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                            TX_4X4);
-            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          }
-        }
-        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
-        x->quantize_b_8x8(x, idx, DCT_DCT);
-        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   ta + vp9_block2above[TX_8X8][idx],
-                                   tl + vp9_block2left[TX_8X8][idx], TX_8X8);
-      }
+  k = i;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      k += (idy * 2 + idx);
+      src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                           x->plane[0].src_diff);
+      coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, k, DCT_DCT, 16);
+      thisdistortion += vp9_block_error(coeff,
+                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                     k, 16), 16);
+      thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                              ta + (k & 1),
+                              tl + (k >> 1), TX_4X4, 16);
     }
   }
+  *distortion += thisdistortion;
+  *labelyrate += thisrate;
+
   *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
 typedef struct {
   int_mv *ref_mv, *second_ref_mv;
   int_mv mvp;
 
   int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
   int r;
   int d;
   int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
+  MB_PREDICTION_MODE modes[4];
+  int_mv mvs[4], second_mvs[4];
+  int eobs[4];
   int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
-
 } BEST_SEG_INFO;
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -2632,47 +1197,113 @@
   return r;
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  x->plane[0].src.buf =
+      raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->plane[0].src.buf,
+                                x->plane[0].src.stride);
+  assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
+  x->e_mbd.plane[0].pre[0].buf =
+      raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->e_mbd.plane[0].pre[0].buf,
+                                x->e_mbd.plane[0].pre[0].stride);
+  if (mbmi->ref_frame[1])
+    x->e_mbd.plane[0].pre[1].buf =
+        raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                  x->e_mbd.plane[0].pre[1].buf,
+                                  x->e_mbd.plane[0].pre[1].stride);
+}
+
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  x->plane[0].src = orig_src;
+  x->e_mbd.plane[0].pre[0] = orig_pre[0];
+  if (mbmi->ref_frame[1])
+    x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]);
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                    int mi_row, int mi_col) {
   int i, j;
-  int const *labels;
   int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
+  MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
+  const int label_count = 4;
   int64_t this_segment_rd = 0, other_segment_rd;
   int label_mv_thresh;
   int rate = 0;
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
+  int best_eobs[4] = { 0 };
+  BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
+  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+  ENTROPY_CONTEXT t_above[4], t_left[4];
+  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
 
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b, *tl_b;
+  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
+  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)];
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
   // segments.   setting this to 1 would make mv thresh
@@ -2680,206 +1311,195 @@
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV,
-                          mbmi->mb_mode_context[mbmi->ref_frame]);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
   other_segment_rd = this_segment_rd;
 
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
+      int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
+      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+      int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+      MB_PREDICTION_MODE mode_selected = ZEROMV;
+      int bestlabelyrate = 0;
+      i = idy * 2 + idx;
 
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
-      ENTROPY_CONTEXT *ta_s;
-      ENTROPY_CONTEXT *tl_s;
+      frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
+      frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
+      vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+                                    &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
+                                    &frame_mv[NEARMV][mbmi->ref_frame[0]],
+                                    i, 0);
+      if (mbmi->ref_frame[1] > 0)
+        vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+                                   &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
+                                   &frame_mv[NEARMV][mbmi->ref_frame[1]],
+                                   i, 1);
 
-      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+      // search for the best motion vector on this segment
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        int64_t this_rd;
+        int distortion;
+        int labelyrate;
+        ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
 
-      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
-      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
 
-      // motion search for newmv (single predictor case only)
-      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        BLOCK *c;
-        BLOCKD *e;
+        vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
+        vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
 
-        /* Is the best so far sufficiently good that we cant justify doing
-         * and new motion search. */
-        if (best_label_rd < label_mv_thresh)
-          break;
+        // motion search for newmv (single predictor case only)
+        if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV) {
+          int step_param = 0;
+          int further_steps;
+          int thissme, bestsme = INT_MAX;
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
 
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+          /* Is the best so far sufficiently good that we cant justify doing
+           * and new motion search. */
+          if (best_label_rd < label_mv_thresh)
+            break;
 
-            step_param = bsi->sv_istep[i];
+          if (cpi->compressor_speed) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
+              bsi->mvp.as_int =
+              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int =
+                x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+              step_param = 2;
+            }
           }
 
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int;
-            step_param = 2;
-          }
-        }
+          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
           mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
 
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          c = &x->block[n];
-          e = &x->e_mbd.block[n];
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                            sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
+                                           bsi->ref_mv, &mode_mv[NEWMV]);
 
-          sseshift = segmentation_to_sseshift[segmentation];
-
           // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+          if (cpi->compressor_speed == 0) {
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
 
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+            thissme = cpi->full_search_sad(x, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
                                            x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv);
+                                           bsi->ref_mv, i);
 
             if (thissme < bestsme) {
               bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int;
+              mode_mv[NEWMV].as_int =
+                  x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
             } else {
               /* The full search result is actually worse so re-instate the
                * previous best vector */
-              e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int;
+              x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
+                  mode_mv[NEWMV].as_int;
             }
           }
-        }
 
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       x->nmvjointcost, x->mvcost,
-                                       &distortion, &sse);
+          if (bestsme < INT_MAX) {
+            int distortion;
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
+                                         bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                         x->nmvjointcost, x->mvcost,
+                                         &distortion, &sse);
 
-          // safe motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+            // safe motion search result for use in compound prediction
+            seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
+          }
+
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        } else if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV) {
+          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
+            continue;
+
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+            iterative_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                    scaled_ref_frame,
+                                    mi_row, mi_col, seg_mvs[i]);
+            seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+            seg_mvs[i][mbmi->ref_frame[1]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+          }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
         }
-      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
-        /* NEW4X4 */
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+
+        rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
+                           &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
+                           bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                           x->mvcost, cpi);
+
+        // Trap vectors that reach beyond the UMV borders
+        if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+            ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
           continue;
         }
-      }
+        if (mbmi->ref_frame[1] > 0 &&
+            mv_check_bounds(x, &second_mode_mv[this_mode]))
+          continue;
 
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                         x->mvcost);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
-          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
-        continue;
-      }
-      if (mbmi->second_ref_frame > 0 &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
         this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
-                                          &distortion, ta_s, tl_s);
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
-                                              x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              ta_s, tl_s);
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
+                                          x, i, &labelyrate,
+                                          &distortion, t_above_s, t_left_s);
+        this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+        rate += labelyrate;
 
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.eobs[j];
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.eobs[idx];
-          }
+        if (this_rd < best_label_rd) {
+          sbr = rate;
+          sbd = distortion;
+          bestlabelyrate = labelyrate;
+          mode_selected = this_mode;
+          best_label_rd = this_rd;
+          best_eobs[i] = x->e_mbd.plane[0].eobs[i];
+          vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
+          vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
         }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
+      } /*for each 4x4 mode*/
 
-        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(t_above, t_above_b, sizeof(t_above));
+      vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      }
-    } /*for each 4x4 mode*/
+      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
+                  &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
+                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  x->mvcost, cpi);
 
-    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+      br += sbr;
+      bd += sbd;
+      segmentyrate += bestlabelyrate;
+      this_segment_rd += best_label_rd;
+      other_segment_rd += best_other_rd;
 
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
+      for (j = 1; j < bh; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j * 2],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+      for (j = 1; j < bw; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+    }
   } /* for each label */
 
   if (this_segment_rd < bsi->segment_rd) {
@@ -2887,152 +1507,33 @@
     bsi->d = bd;
     bsi->segment_yrate = segmentyrate;
     bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
 
     // store everything needed to come back to this!!
-    for (i = 0; i < 16; i++) {
+    for (i = 0; i < 4; i++) {
       bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame > 0)
+      if (mbmi->ref_frame[1] > 0)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
       bsi->modes[i] = x->partition_info->bmi[i].mode;
       bsi->eobs[i] = best_eobs[i];
     }
   }
-
-  if (completed) {
-    *completed = i;
-  }
 }
 
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
-            txfm_cache[ALLOW_32X32] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static INLINE void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
-
 static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
                                        int64_t best_rd,
-                                       int *mdcounts,
                                        int *returntotrate,
                                        int *returnyrate,
                                        int *returndistortion,
                                        int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                       int mi_row, int mi_col) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
 
   vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
 
   bsi.segment_rd = best_rd;
   bsi.ref_mv = best_ref_mv;
@@ -3039,126 +1540,43 @@
   bsi.second_ref_mv = second_best_ref_mv;
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
 
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
+  for (i = 0; i < 4; i++)
+    bsi.modes[i] = ZEROMV;
 
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
+  rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col);
 
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
-       * according to the closeness of 2 MV. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Not skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
   /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    BLOCKD *bd = &x->e_mbd.block[i];
-
-    bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame > 0)
-      bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int;
-    x->e_mbd.eobs[i] = bsi.eobs[i];
+  for (i = 0; i < 4; i++) {
+    x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
+    if (mbmi->ref_frame[1] > 0)
+      x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
+      bsi.second_mvs[i].as_int;
+    x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
   }
 
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = bsi.txfm_size == TX_4X4 ?
-                    vp9_mby_is_skippable_4x4(&x->e_mbd) :
-                    vp9_mby_is_skippable_8x8(&x->e_mbd);
-
   /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+  x->partition_info->count = 4;
 
   for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+    x->partition_info->bmi[i].mode = bsi.modes[i];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv;
+    if (mbmi->ref_frame[1] > 0)
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv;
   }
   /*
    * used to set mbmi->mv.as_int
    */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame > 0)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+  x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int;
+  if (mbmi->ref_frame[1] > 0)
+    x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int;
 
+  *returntotrate = bsi.r;
+  *returndistortion = bsi.d;
+  *returnyrate = bsi.segment_yrate;
+  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+  mbmi->mode = bsi.modes[3];
+
   return (int)(bsi.segment_rd);
 }
 
@@ -3169,18 +1587,17 @@
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int_mv this_mv;
   int i;
-  int zero_seen = FALSE;
+  int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
 
-  BLOCK *b = &x->block[0];
-  uint8_t *src_y_ptr = *(b->base_src);
+  uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
   int row_offset, col_offset;
 
   // Get the sad for each candidate reference mv
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
     this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
 
     // The list is at an end if we see 0 for a second time.
@@ -3193,7 +1610,7 @@
     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
 
     // Find sad for current vector.
-    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                            ref_y_ptr, ref_y_stride,
                                            0x7fffffff);
 
@@ -3208,150 +1625,64 @@
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
-    // printf("%d,%d,%d,%d\n",
-    //       modes[0], modes[1], modes[2], modes[3]);
-  }
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].bmi = xd->mode_info_context->bmi[i];
-  }
-}
-
-extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  int intra_count = rfct[INTRA_FRAME];
-  int last_count  = rfct[LAST_FRAME];
-  int gf_count    = rfct[GOLDEN_FRAME];
-  int arf_count   = rfct[ALTREF_FRAME];
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  if (pred_ref == INTRA_FRAME) {
-    norm_cnt[0] = 0;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[0] = 0;    // This branch implicit
-  } else if (pred_ref == LAST_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = 0;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[1] = 0;    // This branch implicit
-  } else if (pred_ref == GOLDEN_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = 0;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs_single,
+                                     unsigned int *ref_costs_comp,
+                                     vp9_prob *comp_mode_p) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                             SEG_LVL_REF_FRAME);
+  if (seg_ref_active) {
+    vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+    vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+    *comp_mode_p = 128;
   } else {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = 0;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  }
-}
+    vp9_prob intra_inter_p = vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER);
+    vp9_prob comp_inter_p = 128;
 
-static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
-                                     int idx, int val, int weight) {
-  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
-  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
-  // weight is 16-bit fixed point, so this basically calculates:
-  // 0.5 + weight * cost1 + (1.0 - weight) * cost0
-  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
-}
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      comp_inter_p = vp9_get_pred_prob(cm, xd, PRED_COMP_INTER_INTER);
+      *comp_mode_p = comp_inter_p;
+    } else {
+      *comp_mode_p = 128;
+    }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  vp9_prob *mod_refprobs;
+    ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
-  unsigned int cost;
-  int pred_ref;
-  int pred_flag;
-  int pred_ctx;
-  int i;
+    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+      vp9_prob ref_single_p1 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P1);
+      vp9_prob ref_single_p2 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P2);
+      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-  vp9_prob pred_prob, new_pred_prob;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        base_cost += vp9_cost_bit(comp_inter_p, 0);
 
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Get the predicted reference for this mb
-  pred_ref = vp9_get_pred_ref(cm, xd);
-
-  // Get the context probability for the prediction flag (based on last frame)
-  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-  // Predict probability for current frame based on stats so far
-  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
-  new_pred_prob = get_binary_prob(cpi->ref_pred_count[pred_ctx][0],
-                                  cpi->ref_pred_count[pred_ctx][1]);
-
-  // Get the set of probabilities to use if prediction fails
-  mod_refprobs = cm->mod_refprobs[pred_ref];
-
-  // For each possible selected reference frame work out a cost.
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (seg_ref_active && seg_ref_count == 1) {
-      cost = 0;
+      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+          ref_costs_single[ALTREF_FRAME] = base_cost;
+      ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
     } else {
-      pred_flag = (i == pred_ref);
+      ref_costs_single[LAST_FRAME]   = 512;
+      ref_costs_single[GOLDEN_FRAME] = 512;
+      ref_costs_single[ALTREF_FRAME] = 512;
+    }
+    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+      vp9_prob ref_comp_p = vp9_get_pred_prob(cm, xd, PRED_COMP_REF_P);
+      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      // Get the prediction for the current mb
-      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
-                           pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        base_cost += vp9_cost_bit(comp_inter_p, 1);
 
-      // for incorrectly predicted cases
-      if (! pred_flag) {
-        vp9_prob curframe_mod_refprobs[3];
-
-        if (cpi->seg0_progress) {
-          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
-        } else {
-          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
-        }
-
-        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
-                              (i != INTRA_FRAME), cpi->seg0_progress);
-        if (i != INTRA_FRAME) {
-          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
-                                (i != LAST_FRAME), cpi->seg0_progress);
-          if (i != LAST_FRAME) {
-            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
-                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
-          }
-        }
-      }
+      ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
+    } else {
+      ref_costs_comp[LAST_FRAME]   = 512;
+      ref_costs_comp[GOLDEN_FRAME] = 512;
     }
-
-    ref_costs[i] = cost;
   }
 }
 
@@ -3368,11 +1699,11 @@
   // restored if we decide to encode this way
   ctx->skip = x->skip;
   ctx->best_mode_index = mode_index;
-  vpx_memcpy(&ctx->mic, xd->mode_info_context,
-             sizeof(MODE_INFO));
+  ctx->mic = *xd->mode_info_context;
+
   if (partition)
-    vpx_memcpy(&ctx->partition_info, partition,
-               sizeof(PARTITION_INFO));
+    ctx->partition_info = *partition;
+
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
@@ -3383,82 +1714,69 @@
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *rate2, int *distortion2, int *rate_y,
-                            int *distortion, int* rate_uv, int *distortion_uv,
-                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
-  int y_skippable, uv_skippable;
+static void setup_pred_block(const MACROBLOCKD *xd,
+                             struct buf_2d dst[MAX_MB_PLANE],
+                             const YV12_BUFFER_CONFIG *src,
+                             int mi_row, int mi_col,
+                             const struct scale_factors *scale,
+                             const struct scale_factors *scale_uv) {
+  int i;
 
-  // Y cost and distortion
-  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
+  dst[0].buf = src->y_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+#if CONFIG_ALPHA
+  dst[3].buf = src->alpha_buffer;
+  dst[3].stride = src->alpha_stride;
+#endif
 
-  *rate2 += *rate_y;
-  *distortion2 += *distortion;
-
-  // UV cost and distortion
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&
-      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&
-      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
-    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable, 1);
-  else
-    rd_inter16x16_uv_4x4(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable, 1);
-
-  *rate2 += *rate_uv;
-  *distortion2 += *distortion_uv;
-  *skippable = y_skippable && uv_skippable;
+  // TODO(jkoleszar): Make scale factors per-plane data
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
+                     i ? scale_uv : scale,
+                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+  }
 }
 
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
-                               int block_size,
-                               int mb_row, int mb_col,
+                               enum BlockSize block_size,
+                               int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int frame_mdcounts[4][4],
-                               YV12_BUFFER_CONFIG yv12_mb[4],
+                               struct buf_2d yv12_mb[4][MAX_MB_PLANE],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;
 
   // set up scaling factors
   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
+
   scale[frame_type].x_offset_q4 =
-      (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf;
+      ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
+       VP9_REF_SCALE_SHIFT) & 0xf;
   scale[frame_type].y_offset_q4 =
-      (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf;
+      ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
+       VP9_REF_SCALE_SHIFT) & 0xf;
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col,
+  setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
                    &scale[frame_type], &scale[frame_type]);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  use_prev_in_find_mv_refs = cm->width == cm->last_width &&
-                             cm->height == cm->last_height &&
-                             !cpi->common.error_resilient_mode;
   vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
-                   use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL,
+                   xd->prev_mode_info_context,
                    frame_type,
                    mbmi->ref_mvs[frame_type],
                    cpi->common.ref_frame_sign_bias);
 
   // Candidate refinement carried out at encoder and decoder
-  use_prev_in_find_best_ref =
-      scale[frame_type].x_num == scale[frame_type].x_den &&
-      scale[frame_type].y_num == scale[frame_type].y_den &&
-      !cm->error_resilient_mode &&
-      !cm->frame_parallel_decoding_mode;
   vp9_find_best_ref_mvs(xd,
-                        use_prev_in_find_best_ref ?
-                            yv12_mb[frame_type].y_buffer : NULL,
-                        yv12->y_stride,
                         mbmi->ref_mvs[frame_type],
                         &frame_nearest_mv[frame_type],
                         &frame_near_mv[frame_type]);
@@ -3466,9 +1784,9 @@
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
   // The current implementation doesn't support scaling.
-  if (scale[frame_type].x_num == scale[frame_type].x_den &&
-      scale[frame_type].y_num == scale[frame_type].y_den)
-    mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
+  if (scale[frame_type].x_scale_fp == (1 << VP9_REF_SCALE_SHIFT) &&
+      scale[frame_type].y_scale_fp == (1 << VP9_REF_SCALE_SHIFT))
+    mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
             frame_type, block_size);
 }
 
@@ -3485,7 +1803,10 @@
   // TODO(debargha): Implement the functions by interpolating from a
   // look-up table
   vp9_clear_system_state();
-  {
+  if (var == 0 || n == 0) {
+    *rate = 0;
+    *dist = 0;
+  } else {
     double D, R;
     double s2 = (double) var / n;
     double s = sqrt(s2);
@@ -3515,44 +1836,229 @@
   vp9_clear_system_state();
 }
 
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  return get_block_size(plane_block_width(bsize, pd),
+                        plane_block_height(bsize, pd));
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+
+  const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+  const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+  return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame[0],
+                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;
+  // Prediction buffer from second frame.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d scaled_first_yv12;
+  int last_besterr[2] = {INT_MAX, INT_MAX};
+
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+  scaled_first_yv12 = xd->plane[0].pre[0];
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Allow joint search multiple times iteratively for each ref frame
+  // and break out the search loop if it couldn't find better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get pred block from second frame.
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // Compound motion search on first ref frame.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Use mv result from single mode as mvp.
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+
+    tmp_mv.as_mv.col >>= 3;
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  // restore the predictor
+  if (scaled_ref_frame[0]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 enum BlockSize block_size,
-                                 int *saddone, int near_sadidx[],
-                                 int mdcounts[4], int64_t txfm_cache[],
+                                 BLOCK_SIZE_TYPE bsize,
+                                 int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
-                                 int *compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                 int *compmode_interintra_cost,
-#endif
                                  int *rate_y, int *distortion_y,
                                  int *rate_uv, int *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
-                                 int mode_index,
                                  INTERPOLATIONFILTERTYPE *best_filter,
-                                 int_mv frame_mv[MB_MODE_COUNT]
-                                                [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mb_row, int mb_col) {
+                                 int_mv *frame_mv,
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const enum BlockSize uv_block_size = get_plane_block_size(bsize,
+                                                            &xd->plane[1]);
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  const int is_comp_pred = (mbmi->second_ref_frame > 0);
-#if CONFIG_COMP_INTERINTRA_PRED
-  const int is_comp_interintra_pred = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
+  const int is_comp_pred = (mbmi->ref_frame[1] > 0);
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
   int i;
-  int refs[2] = { mbmi->ref_frame,
-                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int refs[2] = { mbmi->ref_frame[0],
+                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
   int_mv ref_mv[2];
   int64_t this_rd = 0;
-  unsigned char tmp_ybuf[64 * 64];
-  unsigned char tmp_ubuf[32 * 32];
-  unsigned char tmp_vbuf[32 * 32];
+  unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
   int pred_exists = 0;
   int interpolating_intpel_seen = 0;
   int intpel_mv;
@@ -3564,19 +2070,27 @@
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
-            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
+        // Initialize mv using single prediction mode result.
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        if (cpi->sf.comp_inter_joint_search_thresh < bsize)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
+
+        if (frame_mv[refs[0]].as_int == INVALID_MV ||
+            frame_mv[refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
+        *rate2 += vp9_mv_bit_cost(&frame_mv[refs[0]],
                                   &ref_mv[0],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
+        *rate2 += vp9_mv_bit_cost(&frame_mv[refs[1]],
                                   &ref_mv[1],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
-        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
         int bestsme = INT_MAX;
         int further_steps, step_param = cpi->sf.first_step;
         int sadpb = x->sadperbit16;
@@ -3588,14 +2102,17 @@
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
+          int i;
+
           // Swap out the reference frame for a version that's been scaled to
           // match the resolution of the current frame, allowing the existing
           // motion search code to be used without additional modifications.
-          xd->pre = *scaled_ref_frame;
-          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;
-          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
-          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
         }
 
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
@@ -3615,7 +2132,7 @@
         // Further step/diamond searches as necessary
         further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
+        bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                          sadpb, further_steps, 1,
                                          &cpi->fn_ptr[block_size],
                                          &ref_mv[0], &tmp_mv);
@@ -3628,7 +2145,7 @@
         if (bestsme < INT_MAX) {
           int dis; /* TODO: use dis in distortion calculation later. */
           unsigned int sse;
-          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
+          cpi->find_fractional_mv_step(x, &tmp_mv,
                                        &ref_mv[0],
                                        x->errorperbit,
                                        &cpi->fn_ptr[block_size],
@@ -3635,8 +2152,8 @@
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        d->bmi.as_mv[0].as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int;
+        frame_mv[refs[0]].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -3644,8 +2161,11 @@
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
-          xd->pre = backup_yv12;
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
         }
       }
       break;
@@ -3656,9 +2176,13 @@
       break;
   }
   for (i = 0; i < num_refs; ++i) {
-    cur_mv[i] = frame_mv[this_mode][refs[i]];
+    cur_mv[i] = frame_mv[refs[i]];
     // Clip "next_nearest" so that it does not extend to far out of image
-    clamp_mv2(&cur_mv[i], xd);
+    if (this_mode == NEWMV)
+      assert(!clamp_mv2(&cur_mv[i], xd));
+    else
+      clamp_mv2(&cur_mv[i], xd);
+
     if (mv_check_bounds(x, &cur_mv[i]))
       return INT64_MAX;
     mbmi->mv[i].as_int = cur_mv[i].as_int;
@@ -3669,24 +2193,8 @@
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
    * if the first is known */
-  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
-                                is_comp_pred);
   *rate2 += vp9_cost_mv_ref(cpi, this_mode,
-                            mbmi->mb_mode_context[mbmi->ref_frame]);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (!is_comp_pred) {
-    *compmode_interintra_cost = vp9_cost_bit(cm->fc.interintra_prob,
-                                             is_comp_interintra_pred);
-    if (is_comp_interintra_pred) {
-      *compmode_interintra_cost +=
-          x->mbmode_cost[xd->frame_type][mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-      *compmode_interintra_cost +=
-          x->intra_uv_mode_cost[xd->frame_type][mbmi->interintra_uv_mode];
-#endif
-    }
-  }
-#endif
+                            mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
   pred_exists = 0;
   interpolating_intpel_seen = 0;
@@ -3698,342 +2206,106 @@
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (block_size == BLOCK_64X64) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+  if (cpi->speed > 4) {
+    *best_filter = EIGHTTAP;
+  } else {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      if (cm->mcomp_filter_type == SWITCHABLE)
+        rs = get_switchable_rate(cm, x);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter64x64_predictors_sb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride,
-                                           mb_row, mb_col);
-        var = vp9_variance64x64(*(b->base_src), b->src_stride,
-                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
-                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
-                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
+        vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
+
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int i;
-        for (i = 0; i < 64; ++i)
-          vpx_memcpy(tmp_ybuf + i * 64,
-                     xd->dst.y_buffer + i * xd->dst.y_stride,
-                     sizeof(unsigned char) * 64);
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_ubuf + i * 32,
-                     xd->dst.u_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 32);
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_vbuf + i * 32,
-                     xd->dst.v_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 32);
-        pred_exists = 1;
-      }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
-    }
-  } else if (block_size == BLOCK_32X32) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-       ++switchable_filter_index) {
-      int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
-      } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter32x32_predictors_sb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride,
-                                           mb_row, mb_col);
-        var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        int p;
+
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+          const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+          int i;
+
+          for (i = 0; i < y; i++)
+            vpx_memcpy(&tmp_buf[p][64 * i],
+                       xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
         }
-      }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
-      if (newbest) {
-        best_rd = rd;
-        *best_filter = mbmi->interp_filter;
-      }
-      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
-          (cm->mcomp_filter_type != SWITCHABLE &&
-           cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int i;
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_ybuf + i * 64,
-                     xd->dst.y_buffer + i * xd->dst.y_stride,
-                     sizeof(unsigned char) * 32);
-        for (i = 0; i < 16; ++i)
-          vpx_memcpy(tmp_ubuf + i * 32,
-                     xd->dst.u_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 16);
-        for (i = 0; i < 16; ++i)
-          vpx_memcpy(tmp_vbuf + i * 32,
-                     xd->dst.v_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 16);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
-  } else {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    assert(block_size == BLOCK_16X16);
-    for (switchable_filter_index = 0;
-       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-       ++switchable_filter_index) {
-      int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
-      } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                           xd->predictor + 256,
-                                           xd->predictor + 320,
-                                           16, 8, mb_row, mb_col);
-        var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                                xd->predictor, 16, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride,
-                              &xd->predictor[256], 8, &sse);
-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride,
-                              &xd->predictor[320], 8, &sse);
-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
-        }
-      }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
-      if (newbest) {
-        best_rd = rd;
-        *best_filter = mbmi->interp_filter;
-      }
-      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
-          (cm->mcomp_filter_type != SWITCHABLE &&
-           cm->mcomp_filter_type == mbmi->interp_filter)) {
-        vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256);
-        vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64);
-        vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64);
-        pred_exists = 1;
-      }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
-    }
   }
 
   // Set the appripriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+                             cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
+
   if (pred_exists) {
-    if (block_size == BLOCK_64X64) {
-      for (i = 0; i < 64; ++i)
-        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
-                   sizeof(unsigned char) * 64);
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
-                   sizeof(unsigned char) * 32);
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
-                   sizeof(unsigned char) * 32);
-    } else if (block_size == BLOCK_32X32) {
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
-                   sizeof(unsigned char) * 32);
-      for (i = 0; i < 16; ++i)
-        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
-                   sizeof(unsigned char) * 16);
-      for (i = 0; i < 16; ++i)
-        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
-                   sizeof(unsigned char) * 16);
-    } else {
-      vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256);
-      vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned char) * 64);
-      vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64);
+    int p;
+
+    for (p = 0; p < MAX_MB_PLANE; p++) {
+      const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+      const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+      int i;
+
+      for (i = 0; i < y; i++)
+        vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
+                   &tmp_buf[p][64 * i], x);
     }
   } else {
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
-    if (block_size == BLOCK_64X64) {
-      vp9_build_inter64x64_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else if (block_size == BLOCK_32X32) {
-      vp9_build_inter32x32_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                         xd->predictor + 256,
-                                         xd->predictor + 320,
-                                         16, 8, mb_row, mb_col);
-    }
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
+  if (cpi->common.mcomp_filter_type == SWITCHABLE)
+    *rate2 += get_switchable_rate(cm, x);
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
   else if (x->encode_breakout) {
     unsigned int var, sse;
-    int threshold = (xd->block[0].dequant[1]
-                     * xd->block[0].dequant[1] >> 4);
+    int threshold = (xd->plane[0].dequant[1]
+                     * xd->plane[0].dequant[1] >> 4);
 
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (block_size == BLOCK_64X64) {
-      var = vp9_variance64x64(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-    } else if (block_size == BLOCK_32X32) {
-      var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-    } else {
-      assert(block_size == BLOCK_16X16);
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    }
+    var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
+                                     x->plane[0].src.stride,
+                                     xd->plane[0].dst.buf,
+                                     xd->plane[0].dst.stride,
+                                     &sse);
 
     if ((int)sse < threshold) {
-      unsigned int q2dc = xd->block[0].dequant[0];
+      unsigned int q2dc = xd->plane[0].dequant[0];
       /* If there is no codeable 2nd order dc
          or a very small uniform pixel change change */
       if ((sse - var < q2dc * q2dc >> 4) ||
@@ -4040,26 +2312,17 @@
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
         int sse2;
+        unsigned int sse2u, sse2v;
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[1].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2u);
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[2].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2v);
+        sse2 = sse2u + sse2v;
 
-        if (block_size == BLOCK_64X64) {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        } else if (block_size == BLOCK_32X32) {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        } else {
-          assert(block_size == BLOCK_16X16);
-          sse2 = vp9_uvsse(x);
-        }
-
         if (sse2 * 2 < threshold) {
           x->skip = 1;
           *distortion = sse + sse2;
@@ -4077,42 +2340,21 @@
   }
 
   if (!x->skip) {
-    if (block_size == BLOCK_64X64) {
-      int skippable_y, skippable_uv;
+    int skippable_y, skippable_uv;
 
-      // Y cost and distortion
-      super_block_64_yrd(cpi, x, rate_y, distortion_y,
-                         &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
+    // Y cost and distortion
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+                    bsize, txfm_cache);
 
-      rd_inter64x64_uv(cpi, x, rate_uv, distortion_uv,
-                       cm->full_pixel, &skippable_uv);
+    *rate2 += *rate_y;
+    *distortion += *distortion_y;
 
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else if (block_size == BLOCK_32X32) {
-      int skippable_y, skippable_uv;
+    super_block_uvrd(cm, x, rate_uv, distortion_uv,
+                     &skippable_uv, bsize);
 
-      // Y cost and distortion
-      super_block_yrd(cpi, x, rate_y, distortion_y,
-                      &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter32x32_uv(cpi, x, rate_uv, distortion_uv,
-                       cm->full_pixel, &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else {
-      assert(block_size == BLOCK_16X16);
-      inter_mode_cost(cpi, x, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    }
+    *rate2 += *rate_uv;
+    *distortion += *distortion_uv;
+    *skippable = skippable_y && skippable_uv;
   }
 
   if (!(*mode_excluded)) {
@@ -4121,1065 +2363,82 @@
     } else {
       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
-#endif
   }
 
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int mb_row, int mb_col,
-                               int *returnrate, int *returndistortion,
-                               int64_t *returnintra) {
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-    VP9_ALT_FLAG };
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate, int *returndist,
+                               BLOCK_SIZE_TYPE bsize,
+                               PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_PREDICTION_MODE best_mode = DC_PRED;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
-  int64_t best_overall_rd = INT64_MAX;
-  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
-  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
-  int uv_intra_skippable = 0;
-  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
-  int uv_intra_skippable_8x8 = 0;
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-
-  MB_PREDICTION_MODE uv_intra_mode;
-  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
-  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  int saddone = 0;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y1dc_delta_q);
-
-  struct scale_factors scale_factor[4];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
-             sizeof(PICK_MODE_CONTEXT));
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
-                       LAST_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
-                       GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
-                       ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  *returnintra = INT64_MAX;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-
-  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
-                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
-                          &uv_intra_skippable);
-  uv_intra_mode = mbmi->uv_mode;
-
-  /* rough estimate for now */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
-                                &uv_intra_rate_tokenonly_8x8,
-                                &uv_intra_distortion_8x8,
-                                &uv_intra_skippable_8x8);
-    uv_intra_mode_8x8 = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-    YV12_BUFFER_CONFIG *scaled_ref_frame;
-
-    // These variables hold are rolling total cost and distortion for this mode
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    x->skip = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
-    mbmi->interp_filter = cm->mcomp_filter_type;
-
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // Ensure that the references used by this mode are available.
-    if (mbmi->ref_frame &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
-      continue;
-
-    // only scale on zeromv.
-    if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
-      continue;
-    // Disable this drop out case if  the ref frame segment
-    // level feature is enabled for this segment. This is to
-    // prevent the possibility that the we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    scaled_ref_frame = NULL;
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-      int fb;
-
-      xd->pre = yv12_mb[ref];
-      best_ref_mv = mbmi->ref_mvs[ref][0];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
-      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      int ref = mbmi->second_ref_frame;
-
-      xd->second_pre = yv12_mb[ref];
-      second_best_ref_mv = mbmi->ref_mvs[ref][0];
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case V_PRED:
-        case H_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          rate2 += intra_cost_penalty;
-        case DC_PRED:
-        case TM_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_mby(&x->e_mbd);
-          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-          if (mbmi->txfm_size != TX_4X4) {
-            rate2 += uv_intra_rate_8x8;
-            rate_uv = uv_intra_rate_tokenonly_8x8;
-            distortion2 += uv_intra_distortion_8x8;
-            distortion_uv = uv_intra_distortion_8x8;
-            skippable = skippable && uv_intra_skippable_8x8;
-          } else {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            skippable = skippable && uv_intra_skippable;
-          }
-          break;
-        case B_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int64_t tmp_rd;
-
-          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
-                                                      &distortion, mode8x8,
-                                                      best_yrd, txfm_cache);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          /* TODO: uv rate maybe over-estimated here since there is UV intra
-                   mode coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      int pred_exists = 0;
-
-      this_rd_thresh =
-          (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh =
-          (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
-      for (switchable_filter_index = 0;
-           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-           ++switchable_filter_index) {
-        int newbest;
-        mbmi->interp_filter =
-            vp9_switchable_interp[switchable_filter_index];
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        newbest = (tmp_rd < tmp_best_rd);
-        if (newbest) {
-          tmp_best_filter = mbmi->interp_filter;
-          tmp_best_rd = tmp_rd;
-        }
-        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-            (mbmi->interp_filter == cm->mcomp_filter_type &&
-             cm->mcomp_filter_type != SWITCHABLE)) {
-          tmp_best_rdu = tmp_rd;
-          tmp_best_rate = rate;
-          tmp_best_ratey = rate_y;
-          tmp_best_distortion = distortion;
-          tmp_best_skippable = skippable;
-          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&tmp_best_partition, x->partition_info,
-                     sizeof(PARTITION_INFO));
-          for (i = 0; i < 16; i++) {
-            tmp_best_bmodes[i] = xd->block[i].bmi;
-          }
-          pred_exists = 1;
-        }
-      }  // switchable_filter_index loop
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
-        vpx_memcpy(x->partition_info, &tmp_best_partition,
-                   sizeof(PARTITION_INFO));
-        for (i = 0; i < 16; i++) {
-          xd->block[i].bmi = xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
-        }
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
-
-      // If even the 'Y' rd value of split is higher than best so far
-      // then dont bother looking at UV
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
-        vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                          x->e_mbd.predictor, x->src.uv_stride);
-        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,
-                             cpi->common.full_pixel, &uv_skippable, 1);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
-      }
-#endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
-                                  &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mb_row, mb_col);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra)
-      rate2 += compmode_interintra_cost;
-#endif
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is Mb level skip allowed (i.e. not coded at segment level).
-        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-
-        if (skippable) {
-          mbmi->mb_skip_coeff = 1;
-
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else {
-          mbmi->mb_skip_coeff = 0;
-          if (mb_skip_allowed) {
-            int prob_skip_cost = vp9_cost_bit(
-                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              uv_intra_mode_8x8 : uv_intra_mode);
-#endif
-    }
-#endif
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter = tmp_best_filter;
-      best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
-    }
-
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        /*
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-        }
-        */
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-        if (this_mode <= B_PRED) {
-          if (mbmi->txfm_size != TX_4X4
-              && this_mode != B_PRED
-              && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode_8x8;
-          else
-            mbmi->uv_mode = uv_intra_mode;
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        }
-
-        other_cost += ref_costs[mbmi->ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-        if ((this_mode == B_PRED)
-            || (this_mode == I8X8_PRED)
-            || (this_mode == SPLITMV))
-          for (i = 0; i < 16; i++) {
-            best_bmodes[i] = xd->block[i].bmi;
-          }
-      }
-
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-    } else {
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
-          * cpi->rd_thresh_mult[mode_index];
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd;
-      int single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-      }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
-
-    /* keep record of best txfm size */
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != B_PRED) {
-          const int64_t txfm_mode_diff =
-              txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          adj_rd = this_rd + txfm_mode_diff;
-        } else {
-          adj_rd = this_rd;
-        }
-        if (adj_rd < best_txfm_rd[i])
-          best_txfm_rd[i] = adj_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= B_PRED));
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-#endif
-
-  // Accumulate filter usage stats
-  // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
-    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays a
-  // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode <= ALLOW_8X8)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff =
-      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-    if (mbmi->second_ref_frame > 0)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = 0;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                    scale_factor);
-  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
-                       best_mode_index, &best_partition,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                                      mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
-}
-
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
   int rate_y = 0, rate_uv;
   int rate_y_tokenonly = 0, rate_uv_tokenonly;
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+  int64_t err4x4 = INT64_MAX;
   int i;
 
+  vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
+  ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                               &dist_y, &y_skip, txfm_cache);
+                               &dist_y, &y_skip, bsize, txfm_cache);
+  mode = xd->mode_info_context->mbmi.mode;
+  txfm_size = xd->mode_info_context->mbmi.txfm_size;
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip);
+                          &dist_uv, &uv_skip,
+                          (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                       bsize);
+  if (bsize < BLOCK_SIZE_SB8X8)
+    err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
+                                       &rate4x4_y_tokenonly,
+                                       &dist4x4_y, err);
 
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+  if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
                   vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,
-           sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
+    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+    xd->mode_info_context->mbmi.mode = mode;
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+  } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) {
+    *returnrate = rate4x4_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist4x4_y + (dist_uv >> 2);
+    vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+    xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
-    *returnrate = rate_y + rate_uv;
-    if (cpi->common.mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returnrate = rate_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];
+      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
     }
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+    xd->mode_info_context->mbmi.mode = mode;
   }
-}
 
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y = 0, rate_uv;
-  int rate_y_tokenonly = 0, rate_uv_tokenonly;
-  int dist_y = 0, dist_uv;
-  int y_skip = 0, uv_skip;
-  int64_t txfm_cache[NB_TXFM_MODES], err;
-  int i;
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                 &dist_y, &y_skip, txfm_cache);
-  rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip);
-
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-    vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb64_context.txfm_rd_diff, 0,
-           sizeof(x->sb64_context.txfm_rd_diff));
-  } else {
-    *returnrate = rate_y + rate_uv;
-    if (cm->mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-    *returndist = dist_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];
-    }
-  }
+  ctx->mic = *xd->mode_info_context;
 }
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mi_row, int mi_col,
+                                  int *returnrate,
+                                  int *returndistortion,
+                                  BLOCK_SIZE_TYPE bsize,
+                                  PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
-  int dist4x4 = 0, dist16x16 = 0, distuv = 0, distuv8x8 = 0;
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
-  int64_t error8x8;
-  int rate8x8_tokenonly=0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[4];
-  int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
-  int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[2][NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16, txfm_size_8x8;
-  int i;
-
-  mbmi->ref_frame = INTRA_FRAME;
-  mbmi->mode = DC_PRED;
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
-                          &uv_intra_skippable);
-  modeuv = mbmi->uv_mode;
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
-                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
-  } else {
-    uv_intra_skippable_8x8 = uv_intra_skippable;
-    rateuv8x8 = rateuv;
-    distuv8x8 = distuv;
-    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
-  }
-
-  // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
-                                          &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable,
-                                          txfm_cache[1]);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
-    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
-    rate16x16 -= rate16x16_tokenonly;
-  }
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
-                       txfm_cache[1][i];
-  }
-
-  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
-                                                &rate8x8_tokenonly,
-                                                &dist8x8, mode8x8,
-                                                error16x16, txfm_cache[1]);
-  txfm_size_8x8 = mbmi->txfm_size;
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
-    if (tmp_rd < txfm_cache[0][i])
-      txfm_cache[0][i] = tmp_rd;
-  }
-
-  mbmi->txfm_size = TX_4X4;
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16);
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    if (error4x4 < txfm_cache[0][i])
-      txfm_cache[0][i] = error4x4;
-  }
-
-  mbmi->mb_skip_coeff = 0;
-  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8;
-    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16;
-    if (cm->txfm_mode == ONLY_4X4) {
-      rate += rateuv - rateuv_tokenonly;
-      dist += (distuv >> 2);
-    } else {
-      rate += rateuv8x8 - rateuv8x8_tokenonly;
-      dist += (distuv8x8 >> 2);
-    }
-
-    mbmi->txfm_size = txfm_size_16x16;
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv + rate4x4;
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv8x8;
-      dist = dist16x16 + (distuv8x8 >> 2);
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv + rate4x4;
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-    } else {
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = txfm_size_8x8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv;
-      dist = dist8x8 + (distuv >> 2);
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-
-static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int mb_row, int mb_col,
-                                         int *returnrate,
-                                         int *returndistortion,
-                                         int block_size) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MB_PREDICTION_MODE best_mode = DC_PRED;
   MV_REFERENCE_FRAME ref_frame;
@@ -5186,8 +2445,8 @@
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
@@ -5194,9 +2453,6 @@
                      cpi->lst_fb_idx,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
-  int mdcounts[4];
-  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  int saddone = 0;
   int64_t best_rd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -5205,32 +2461,44 @@
   MB_MODE_INFO best_mbmode;
   int j;
   int mode_index, best_mode_index = 0;
-  unsigned int ref_costs[MAX_REF_FRAMES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vp9_prob comp_mode_p;
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,
-      rate_uv_tokenonly_8x8 = 0;
-  int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
-  MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
-  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
-  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
-  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+  int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
+  int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+  MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
   struct scale_factors scale_factor[4];
   unsigned int ref_frame_mask = 0;
   unsigned int mode_mask = 0;
+  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
+  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y_dc_delta_q);
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
+  union b_mode_info best_bmodes[4];
+  PARTITION_INFO best_partition;
+  int bwsl = b_width_log2(bsize);
+  int bws = (1 << bwsl) / 4;  // mode_info step for subsize
+  int bhsl = b_height_log2(bsize);
+  int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
 
+  for (i = 0; i < 4; i++) {
+    int j;
+
+    for (j = 0; j < MAX_REF_FRAMES; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+  // Everywhere the flag is set the error is much higher than its neighbors.
+  ctx->frames_with_high_error = 0;
+  ctx->modes_with_high_error = 0;
+
   xd->mode_info_context->mbmi.segment_id = segment_id;
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -5237,87 +2505,61 @@
   for (i = 0; i < NB_TXFM_MODES; i++)
     best_txfm_rd[i] = INT64_MAX;
 
-  // Create a mask set to 1 for each frame used by a smaller resolution.p
-  if (cpi->Speed > 0) {
+  // Create a mask set to 1 for each frame used by a smaller resolution.
+  if (cpi->speed > 0) {
     switch (block_size) {
       case BLOCK_64X64:
         for (i = 0; i < 4; i++) {
           for (j = 0; j < 4; j++) {
-            ref_frame_mask |= (1 << x->mb_context[i][j].mic.mbmi.ref_frame);
-            mode_mask |= (1 << x->mb_context[i][j].mic.mbmi.mode);
+            ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
+            mode_mask |= x->mb_context[i][j].modes_with_high_error;
           }
         }
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1 << x->sb32_context[i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->sb32_context[i].mic.mbmi.mode);
+          ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
+          mode_mask |= x->sb32_context[i].modes_with_high_error;
         }
         break;
       case BLOCK_32X32:
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1
-              << x->mb_context[xd->sb_index][i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->mb_context[xd->sb_index][i].mic.mbmi.mode);
+          ref_frame_mask |=
+              x->mb_context[xd->sb_index][i].frames_with_high_error;
+          mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
         }
         break;
+      default:
+        // Until we handle all block sizes set it to present;
+        ref_frame_mask = 0;
+        mode_mask = 0;
+        break;
     }
+    ref_frame_mask = ~ref_frame_mask;
+    mode_mask = ~mode_mask;
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
-                         mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         frame_mdcounts, yv12_mb, scale_factor);
+                         mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-  // Disallow intra if none of the smaller prediction sizes used intra and
-  // speed > 0 ;
-  if (cpi->Speed == 0
-      || ( cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
-    if (block_size == BLOCK_64X64) {
-      mbmi->mode = DC_PRED;
-      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-        mbmi->txfm_size = TX_4X4;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                                  &dist_uv_4x4, &uv_skip_4x4);
-        mode_uv_4x4 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode != ONLY_4X4) {
-        mbmi->txfm_size = TX_8X8;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                                  &dist_uv_8x8, &uv_skip_8x8);
-        mode_uv_8x8 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode >= ALLOW_32X32) {
-        mbmi->txfm_size = TX_32X32;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
-                                  &rate_uv_tokenonly_16x16, &dist_uv_16x16,
-                                  &uv_skip_16x16);
-        mode_uv_16x16 = mbmi->uv_mode;
-      }
-    } else {
-      assert(block_size == BLOCK_32X32);
-      mbmi->mode = DC_PRED;
-      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-        mbmi->txfm_size = TX_4X4;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                                &dist_uv_4x4, &uv_skip_4x4);
-        mode_uv_4x4 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode != ONLY_4X4) {
-        mbmi->txfm_size = TX_8X8;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                                &dist_uv_8x8, &uv_skip_8x8);
-        mode_uv_8x8 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode >= ALLOW_32X32) {
-        mbmi->txfm_size = TX_32X32;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16,
-                                &rate_uv_tokenonly_16x16, &dist_uv_16x16,
-                                &uv_skip_16x16);
-        mode_uv_16x16 = mbmi->uv_mode;
-      }
+  if (cpi->speed == 0
+      || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+    mbmi->mode = DC_PRED;
+    mbmi->ref_frame[0] = INTRA_FRAME;
+    for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
+                      (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
+                       (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
+         i++) {
+      mbmi->txfm_size = i;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
+                              &dist_uv[i], &skip_uv[i],
+                              (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                           bsize);
+      mode_uv[i] = mbmi->uv_mode;
     }
   }
 
@@ -5325,33 +2567,39 @@
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
-    int other_cost = 0;
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
+    int i;
 
+    for (i = 0; i < NB_TXFM_MODES; ++i)
+      txfm_cache[i] = INT64_MAX;
+
     // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index] ||
-        cpi->rd_threshes[mode_index] == INT_MAX) {
+    if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
+                     cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
+        cpi->rd_threshes[bsize][mode_index] == INT_MAX)
       continue;
-    }
 
+    // Do not allow compound prediction if the segment level reference
+    // frame feature is in use as in this case there can only be one reference.
+    if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) &&
+         vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+      continue;
+
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-    if (cpi->Speed > 0) {
+
+    if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
+      if (!(mode_mask & (1 << this_mode))) {
+        continue;
+      }
       if (vp9_mode_order[mode_index].second_ref_frame != NONE
           && !(ref_frame_mask
               & (1 << vp9_mode_order[mode_index].second_ref_frame))) {
@@ -5359,17 +2607,41 @@
       }
     }
 
-    mbmi->ref_frame = ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = vp9_mode_order[mode_index].second_ref_frame;
+
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+      continue;
+    }
+    if (!(mbmi->ref_frame[1] == NONE
+        || (cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]]))) {
+      continue;
+    }
+
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // SPLITMV.
+    if (mbmi->ref_frame[0] > 0 &&
+          (scale_factor[mbmi->ref_frame[0]].x_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT) ||
+           scale_factor[mbmi->ref_frame[0]].y_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT)) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->ref_frame[1] > 0 &&
+          (scale_factor[mbmi->ref_frame[1]].x_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT) ||
+           scale_factor[mbmi->ref_frame[1]].y_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT)) &&
+        this_mode == SPLITMV)
+      continue;
+
+    set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                       scale_factor);
-    comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
+    comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
@@ -5376,58 +2648,48 @@
     mbmi->interp_filter = cm->mcomp_filter_type;
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
-    // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
-    //  continue;
-
-    if (this_mode == I8X8_PRED || this_mode == B_PRED || this_mode == SPLITMV)
+    if (bsize >= BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
-    //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
-    //  continue;
+    if (bsize < BLOCK_SIZE_SB8X8 &&
+        !(this_mode == I4X4_PRED || this_mode == SPLITMV))
+      continue;
 
     if (comp_pred) {
-      int second_ref;
-
-      if (ref_frame == ALTREF_FRAME) {
-        second_ref = LAST_FRAME;
-      } else {
-        second_ref = ref_frame + 1;
-      }
-      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+      if (!(cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]]))
         continue;
-      mbmi->second_ref_frame = second_ref;
-      set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+      set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                         scale_factor);
 
-      xd->second_pre = yv12_mb[second_ref];
       mode_excluded =
           mode_excluded ?
               mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
     } else {
-      // mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+      // mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1];
       if (ref_frame != INTRA_FRAME) {
-        if (mbmi->second_ref_frame != INTRA_FRAME)
+        if (mbmi->ref_frame[1] != INTRA_FRAME)
           mode_excluded =
               mode_excluded ?
                   mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-#if CONFIG_COMP_INTERINTRA_PRED
-        else
-          mode_excluded = mode_excluded ? mode_excluded : !cm->use_interintra;
-#endif
       }
     }
 
-    xd->pre = yv12_mb[ref_frame];
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+    // Select predictors
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, ref_frame)) {
+        vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
     } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
+               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
@@ -5442,84 +2704,204 @@
         }
       }
     }
+    // TODO(JBB): This is to make up for the fact that we don't have sad
+    // functions that work when the block size reads outside the umv.  We
+    // should fix this either by making the motion search just work on
+    // a representative block in the boundary ( first ) and then implement a
+    // function that does sads when inside the border..
+    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
+        this_mode == NEWMV) {
+      continue;
+    }
 
-    if (ref_frame == INTRA_FRAME) {
-      if (block_size == BLOCK_64X64) {
-        vp9_build_intra_predictors_sb64y_s(xd);
-        super_block_64_yrd(cpi, x, &rate_y, &distortion_y,
-                           &skippable, txfm_cache);
-      } else {
-        assert(block_size == BLOCK_32X32);
-        vp9_build_intra_predictors_sby_s(xd);
-        super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                        &skippable, txfm_cache);
-      }
-      if (mbmi->txfm_size == TX_4X4) {
-        rate_uv = rate_uv_4x4;
-        distortion_uv = dist_uv_4x4;
-        skippable = skippable && uv_skip_4x4;
-        mbmi->uv_mode = mode_uv_4x4;
-      } else if (mbmi->txfm_size == TX_32X32) {
-        rate_uv = rate_uv_16x16;
-        distortion_uv = dist_uv_16x16;
-        skippable = skippable && uv_skip_16x16;
-        mbmi->uv_mode = mode_uv_16x16;
-      } else {
-        rate_uv = rate_uv_8x8;
-        distortion_uv = dist_uv_8x8;
-        skippable = skippable && uv_skip_8x8;
-        mbmi->uv_mode = mode_uv_8x8;
-      }
+    if (this_mode == I4X4_PRED) {
+      int rate;
 
-      rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                &distortion_y, INT64_MAX);
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      rate2 += rate_uv_intra[TX_4X4];
+      rate_uv = rate_uv_intra[TX_4X4];
+      distortion2 += dist_uv[TX_4X4];
+      distortion_uv = dist_uv[TX_4X4];
+      mbmi->uv_mode = mode_uv[TX_4X4];
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+    } else if (ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx;
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      bsize, txfm_cache);
+
+      uv_tx = mbmi->txfm_size;
+      if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
+        uv_tx = TX_4X4;
+      if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
+        uv_tx = TX_8X8;
+      else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
+        uv_tx = TX_16X16;
+
+      rate_uv = rate_uv_intra[uv_tx];
+      distortion_uv = dist_uv[uv_tx];
+      skippable = skippable && skip_uv[uv_tx];
+      mbmi->uv_mode = mode_uv[uv_tx];
+
+      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-    } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
-      int fb;
+    } else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->ref_frame[1] > 0;
+      int rate, distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = is_comp_pred ?
+          &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;
+      union b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      int pred_exists = 0;
+      int uv_skippable;
 
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
+      this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ?
+          cpi->rd_threshes[bsize][THR_NEWMV] :
+          cpi->rd_threshes[bsize][THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame[0] == GOLDEN_FRAME) ?
+          cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+
+      for (switchable_filter_index = 0;
+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+           ++switchable_filter_index) {
+        int newbest;
+        mbmi->interp_filter =
+        vp9_switchable_interp[switchable_filter_index];
+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                     &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                     second_ref, INT64_MAX,
+                     &rate, &rate_y, &distortion,
+                     &skippable,
+                     (int)this_rd_thresh, seg_mvs,
+                     mi_row, mi_col);
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          const int rs = get_switchable_rate(cm, x);
+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        newbest = (tmp_rd < tmp_best_rd);
+        if (newbest) {
+          tmp_best_filter = mbmi->interp_filter;
+          tmp_best_rd = tmp_rd;
+        }
+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+            (mbmi->interp_filter == cm->mcomp_filter_type &&
+             cm->mcomp_filter_type != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_skippable = skippable;
+              tmp_best_mbmode = *mbmi;
+              tmp_best_partition = *x->partition_info;
+              for (i = 0; i < 4; i++)
+                tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
+              pred_exists = 1;
+            }
+      }  // switchable_filter_index loop
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                     &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                     second_ref, INT64_MAX,
+                     &rate, &rate_y, &distortion,
+                     &skippable,
+                     (int)this_rd_thresh, seg_mvs,
+                     mi_row, mi_col);
       } else {
-        fb = cpi->alt_fb_idx;
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = get_switchable_rate(cm, x);
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        *mbmi = tmp_best_mbmode;
+        *x->partition_info = tmp_best_partition;
+        for (i = 0; i < 4; i++)
+          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
       }
 
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += get_switchable_rate(cm, x);
+
+      // If even the 'Y' rd value of split is higher than best so far
+      // then don't bother looking at UV
+      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                      BLOCK_SIZE_SB8X8);
+      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+      skippable = skippable && uv_skippable;
+
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+
+      if (!mode_excluded) {
+        if (is_comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+
+      compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+    } else {
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+      int fb = get_ref_frame_idx(cpi, mbmi->ref_frame[0]);
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
 
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
+      if (comp_pred) {
+        fb = get_ref_frame_idx(cpi, mbmi->ref_frame[1]);
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
       }
-#endif
-      this_rd = handle_inter_mode(cpi, x, block_size,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
+
+      compmode_cost = vp9_cost_bit(comp_mode_p,
+                                   mbmi->ref_frame[1] > INTRA_FRAME);
+      this_rd = handle_inter_mode(cpi, x, bsize,
+                                  txfm_cache,
                                   &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mb_row, mb_col);
+                                  &tmp_best_filter, frame_mv[this_mode],
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra) {
-      rate2 += compmode_interintra_cost;
-    }
-#endif
     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
       rate2 += compmode_cost;
     }
@@ -5526,7 +2908,11 @@
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
-    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      rate2 += ref_costs_comp[mbmi->ref_frame[0]];
+    } else {
+      rate2 += ref_costs_single[mbmi->ref_frame[0]];
+    }
 
     if (!disable_skip) {
       // Test for the condition where skip block will be activated
@@ -5533,39 +2919,34 @@
       // because there are no non zero coefficients and make any
       // necessary adjustment for rate. Ignore if skip is coded at
       // segment level as the cost wont have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
+      int mb_skip_allowed;
 
-        // Is Mb level skip allowed (i.e. not coded at segment level).
-        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+      // Is Mb level skip allowed (i.e. not coded at segment level).
+      mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
-        if (skippable) {
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
+      if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
+        // Back out the coefficient coding costs
+        rate2 -= (rate_y + rate_uv);
+        // for best_yrd calculation
+        rate_uv = 0;
 
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
+        if (mb_skip_allowed) {
+          int prob_skip_cost;
 
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+          // Cost the skip mb case
+          vp9_prob skip_prob =
+            vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
 
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
+          if (skip_prob) {
+            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+            rate2 += prob_skip_cost;
           }
         }
+      } else if (mb_skip_allowed) {
         // Add in the cost of the no skip flag.
-        else if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                          PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                        PRED_MBSKIP), 0);
+        rate2 += prob_skip_cost;
       }
 
       // Calculate the final RD estimate for this mode.
@@ -5574,26 +2955,14 @@
 
 #if 0
     // Keep record of best intra distortion
-    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+    if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) &&
         (this_rd < best_intra_rd)) {
       best_intra_rd = this_rd;
       *returnintra = distortion2;
     }
 #endif
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              mode_uv_8x8 : mode_uv_4x4);
-#endif
-    }
-#endif
 
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+    if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME)
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
 
@@ -5601,11 +2970,20 @@
       best_overall_rd = this_rd;
       best_filter = tmp_best_filter;
       best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
     }
 
+    if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+      // Store the respective mode distortions for later use.
+      if (mode_distortions[this_mode] == -1
+          || distortion2 < mode_distortions[this_mode]) {
+        mode_distortions[this_mode] = distortion2;
+      }
+      if (frame_distortions[mbmi->ref_frame[0]] == -1
+          || distortion2 < frame_distortions[mbmi->ref_frame[0]]) {
+        frame_distortions[mbmi->ref_frame[0]] = distortion2;
+      }
+    }
+
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
@@ -5612,16 +2990,20 @@
         // Note index of best mode so far
         best_mode_index = mode_index;
 
-        if (this_mode <= B_PRED) {
+        if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         }
 
-        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
         *returnrate = rate2;
         *returndistortion = distortion2;
         best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+        best_mbmode = *mbmi;
+        best_partition = *x->partition_info;
+
+        if (this_mode == I4X4_PRED || this_mode == SPLITMV)
+          for (i = 0; i < 4; i++)
+            best_bmodes[i] = xd->mode_info_context->bmi[i];
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -5649,7 +3031,7 @@
     }
 
     /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+    if (!disable_skip && mbmi->ref_frame[0] != INTRA_FRAME) {
       int single_rd, hybrid_rd, single_rate, hybrid_rate;
 
       if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
@@ -5663,10 +3045,10 @@
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
+      if (mbmi->ref_frame[1] <= INTRA_FRAME &&
           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
+      } else if (mbmi->ref_frame[1] > INTRA_FRAME &&
                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
       }
@@ -5675,14 +3057,23 @@
     }
 
     /* keep record of best txfm size */
+    if (bsize < BLOCK_SIZE_SB32X32) {
+      if (bsize < BLOCK_SIZE_MB16X16) {
+        if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+          txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4];
+        txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8];
+      }
+      txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
+    }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != B_PRED) {
+        int64_t adj_rd = INT64_MAX;
+        if (this_mode != I4X4_PRED) {
           adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
         } else {
           adj_rd = this_rd;
         }
+
         if (adj_rd < best_txfm_rd[i])
           best_txfm_rd[i] = adj_rd;
       }
@@ -5691,22 +3082,61 @@
     if (x->skip && !mode_excluded)
       break;
   }
+  // Flag all modes that have a distortion that's > 2x the best we found at
+  // this level.
+  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
+      continue;
 
+    if (mode_distortions[mode_index] > 2 * *returndistortion) {
+      ctx->modes_with_high_error |= (1 << mode_index);
+    }
+  }
+
+  // Flag all ref frames that have a distortion that's > 2x the best we found at
+  // this level.
+  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+      ctx->frames_with_high_error |= (1 << ref_frame);
+    }
+  }
+
+  if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
+    *returnrate = INT_MAX;
+    *returndistortion = INT_MAX;
+    return best_rd;
+  }
+
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= B_PRED));
+         (best_mbmode.ref_frame[0] == INTRA_FRAME));
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-  // if (is_best_interintra)  printf("best_interintra\n");
-#endif
-
   // Accumulate filter usage stats
   // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
+  if (is_inter_mode(best_mode))
     ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
 
-  // TODO(rbultje) integrate with RD thresholding
+  // Updating rd_thresh_freq_fact[] here means that the different
+  // partition/block sizes are handled independently based on the best
+  // choice for the current partition. It may well be better to keep a scaled
+  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
+  // combination that wins out.
+  if (cpi->sf.adpative_rd_thresh) {
+    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+      if (mode_index == best_mode_index) {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
+      } else {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
+        if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+            (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+          cpi->rd_thresh_freq_fact[bsize][mode_index] =
+            cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+        }
+      }
+    }
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
 #if 0
   // Reduce the activation RD thresholds for the best choice mode
   if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
@@ -5727,16 +3157,22 @@
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
       cpi->is_src_frame_alt_ref &&
       (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame[0] != ALTREF_FRAME)
+      && bsize >= BLOCK_SIZE_SB8X8) {
     mbmi->mode = ZEROMV;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = INTRA_FRAME;
+    mbmi->ref_frame[0] = ALTREF_FRAME;
+    mbmi->ref_frame[1] = NONE;
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
-                      TX_32X32 : cm->txfm_mode;
+    mbmi->mb_skip_coeff = 1;
+    if (cm->txfm_mode == TX_MODE_SELECT) {
+      if (bsize >= BLOCK_SIZE_SB32X32)
+        mbmi->txfm_size = TX_32X32;
+      else if (bsize >= BLOCK_SIZE_MB16X16)
+        mbmi->txfm_size = TX_16X16;
+      else
+        mbmi->txfm_size = TX_8X8;
+    }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
@@ -5744,8 +3180,30 @@
   }
 
   // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  *mbmi = best_mbmode;
+  if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+  }
 
+  if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
+
+    if (mbmi->ref_frame[1] > 0)
+      for (i = 0; i < 4; i++)
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
+
+    *x->partition_info = best_partition;
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+  }
+
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -5765,72 +3223,14 @@
   }
 
  end:
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+  set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                     scale_factor);
-  {
-    PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
-                            &x->sb32_context[xd->sb_index] :
-                            &x->sb64_context;
-    store_coding_context(x, p, best_mode_index, NULL,
-                         &mbmi->ref_mvs[mbmi->ref_frame][0],
-                         &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                             mbmi->second_ref_frame][0],
-                         best_pred_diff, best_txfm_diff);
-  }
+  store_coding_context(x, ctx, best_mode_index,
+                       &best_partition,
+                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
+                                      mbmi->ref_frame[1]][0],
+                       best_pred_diff, best_txfm_diff);
 
   return best_rd;
-}
-
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_32X32);
-}
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_64X64);
-}
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate,
-                       &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
 }
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,26 +19,14 @@
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *r, int *d);
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                               PICK_MODE_CONTEXT *ctx);
 
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
-
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
-
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mi_row, int mi_col,
+                                  int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                                  PICK_MODE_CONTEXT *ctx);
 
 void vp9_init_me_luts();
 
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -23,6 +23,52 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
 }
 
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
+}
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
+}
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
@@ -31,6 +77,52 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
 }
 
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
+                            int   src_stride,
+                            const uint8_t *ref_ptr,
+                            int   ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
+}
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
+                            int   src_stride,
+                            const uint8_t *ref_ptr,
+                            int   ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
+}
+
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
@@ -64,7 +156,22 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
 }
 
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
+}
 
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
+}
+
 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int  src_stride,
                           const uint8_t *ref_ptr,
@@ -469,6 +576,98 @@
                              ref_ptr[2], ref_stride, 0x7fffffff);
   sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t *ref_ptr,
+                     int  ref_stride,
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
+}
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t *ref_ptr,
+                     int  ref_stride,
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -15,54 +15,9 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
-  int mb_row, mb_col;
-
-  MODE_INFO *this_mb_mode_info = cm->mi;
-
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) {
-    // Reset Gf useage monitors
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-  } else {
-    // for each macroblock row in image
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-      // for each macroblock col in image
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-
-        // If using golden then set GF active flag if not already set.
-        // If using last frame 0,0 mode then leave flag as it is
-        // else if using non 0,0 motion or intra modes then clear
-        // flag if it is currently set
-        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
-            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
-          if (*(x->gf_active_ptr) == 0) {
-            *(x->gf_active_ptr) = 1;
-            cpi->gf_active_count++;
-          }
-        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
-                   *(x->gf_active_ptr)) {
-          *(x->gf_active_ptr) = 0;
-          cpi->gf_active_count--;
-        }
-
-        x->gf_active_ptr++;          // Step onto next entry
-        this_mb_mode_info++;         // skip to next mb
-
-      }
-
-      // this is to account for the border
-      this_mb_mode_info++;
-    }
-  }
-}
-
 void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
 
-  // Set the appropriate feature bit
   cpi->mb.e_mbd.segmentation_enabled = 1;
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
   cpi->mb.e_mbd.update_mb_segmentation_data = 1;
@@ -69,9 +24,7 @@
 }
 
 void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
   cpi->mb.e_mbd.segmentation_enabled = 0;
 }
 
@@ -81,7 +34,7 @@
 
   // Copy in the new segmentation map
   vpx_memcpy(cpi->segmentation_map, segmentation_map,
-             (cpi->common.mb_rows * cpi->common.mb_cols));
+             (cpi->common.mi_rows * cpi->common.mi_cols));
 
   // Signal that the map should be updated.
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
@@ -104,104 +57,59 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
+static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts,
                                vp9_prob *segment_tree_probs) {
-  int count1, count2;
-
-  // Total count for all segments
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-
   // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count1, count2);
-  segment_tree_probs[1] = get_prob(segcounts[0], count1);
-  segment_tree_probs[2] = get_prob(segcounts[2], count2);
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+
+  segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67);
+  segment_tree_probs[1] = get_binary_prob(c01, c23);
+  segment_tree_probs[2] = get_binary_prob(c45, c67);
+  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+  segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+  segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
+static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) {
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+  const int c0123 = c01 + c23;
+  const int c4567 = c45 + c67;
 
   // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
+  int cost = c0123 * vp9_cost_zero(probs[0]) +
+             c4567 * vp9_cost_one(probs[0]);
 
-  // Now add the cost of each individual segment branch
-  if (count1 > 0)
-    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
-            segcounts[1] * vp9_cost_one(probs[1]);
+  // Cost subsequent levels
+  if (c0123 > 0) {
+    cost += c01 * vp9_cost_zero(probs[1]) +
+            c23 * vp9_cost_one(probs[1]);
 
-  if (count2 > 0)
-    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
-            segcounts[3] * vp9_cost_one(probs[2]);
+    if (c01 > 0)
+      cost += segcounts[0] * vp9_cost_zero(probs[3]) +
+              segcounts[1] * vp9_cost_one(probs[3]);
+    if (c23 > 0)
+      cost += segcounts[2] * vp9_cost_zero(probs[4]) +
+              segcounts[3] * vp9_cost_one(probs[4]);
+  }
 
-  return cost;
-}
+  if (c4567 > 0) {
+    cost += c45 * vp9_cost_zero(probs[2]) +
+            c67 * vp9_cost_one(probs[2]);
 
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs_pred(MACROBLOCKD *xd,
-                                    int (*segcounts)[MAX_MB_SEGMENTS],
-                                    vp9_prob *segment_tree_probs,
-                                    vp9_prob *mod_probs) {
-  int count[4];
-
-  assert(!segcounts[0][0] && !segcounts[1][1] &&
-         !segcounts[2][2] && !segcounts[3][3]);
-
-  // Total count for all segments
-  count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0];
-  count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1];
-  count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2];
-  count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3];
-
-  // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count[0] + count[1],
-                                          count[2] + count[3]);
-  segment_tree_probs[1] = get_binary_prob(count[0], count[1]);
-  segment_tree_probs[2] = get_binary_prob(count[2], count[3]);
-
-  // now work out modified counts that the decoder would have
-  count[0] =        segment_tree_probs[0]  *        segment_tree_probs[1];
-  count[1] =        segment_tree_probs[0]  * (256 - segment_tree_probs[1]);
-  count[2] = (256 - segment_tree_probs[0]) *        segment_tree_probs[2];
-  count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]);
-
-  // Work out modified probabilties depending on what segment was predicted
-  mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]);
-  mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]);
-  mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]);
-  mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]);
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap_pred(MACROBLOCKD *xd,
-                            int (*segcounts)[MAX_MB_SEGMENTS],
-                            vp9_prob *probs, vp9_prob *mod_probs) {
-  int pred_seg, cost = 0;
-
-  for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) {
-    int count1, count2;
-
-    // Cost the top node of the tree
-    count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1];
-    count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3];
-    cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) +
-            count2 * vp9_cost_one(mod_probs[pred_seg]);
-
-    // Now add the cost of each individual segment branch
-    if (pred_seg >= 2 && count1) {
-      cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) +
-              segcounts[pred_seg][1] * vp9_cost_one(probs[1]);
-    } else if (pred_seg < 2 && count2 > 0) {
-      cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) +
-              segcounts[pred_seg][3] * vp9_cost_one(probs[2]);
-    }
+    if (c45 > 0)
+      cost += segcounts[4] * vp9_cost_zero(probs[5]) +
+              segcounts[5] * vp9_cost_one(probs[5]);
+    if (c67 > 0)
+      cost += segcounts[6] * vp9_cost_zero(probs[6]) +
+              segcounts[7] * vp9_cost_one(probs[6]);
   }
 
   return cost;
@@ -211,16 +119,18 @@
                        MODE_INFO *mi,
                        int *no_pred_segcounts,
                        int (*temporal_predictor_count)[2],
-                       int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS],
-                       int mb_size, int mb_row, int mb_col) {
+                       int *t_unpred_seg_counts,
+                       int bw, int bh, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int segmap_index = mb_row * cm->mb_cols + mb_col;
-  const int segment_id = mi->mbmi.segment_id;
+  int segment_id;
 
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  segment_id = mi->mbmi.segment_id;
   xd->mode_info_context = mi;
-  set_mb_row(cm, xd, mb_row, mb_size);
-  set_mb_col(cm, xd, mb_col, mb_size);
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
@@ -228,7 +138,8 @@
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
     // Test to see if the segment id matches the predicted value.
-    const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index);
+    const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type,
+                                                  mi_row, mi_col);
     const int seg_predicted = (segment_id == pred_seg_id);
 
     // Get the segment id prediction context
@@ -241,10 +152,65 @@
 
     if (!seg_predicted)
       // Update the "unpredicted" segment count
-      t_unpred_seg_counts[pred_seg_id][segment_id]++;
+      t_unpred_seg_counts[segment_id]++;
   }
 }
 
+static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi,
+                          int *no_pred_segcounts,
+                          int (*temporal_predictor_count)[2],
+                          int *t_unpred_seg_counts,
+                          int mi_row, int mi_col,
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  bwl = mi_width_log2(mi->mbmi.sb_type);
+  bhl = mi_height_log2(mi->mbmi.sb_type);
+
+  if (bwl == bsl && bhl == bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col);
+  } else if (bwl == bsl && bhl < bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col);
+    count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col);
+  } else if (bwl < bsl && bhl == bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col);
+    count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs);
+  } else {
+    BLOCK_SIZE_TYPE subsize;
+    int n;
+
+    assert(bwl < bsl && bhl < bsl);
+    if (bsize == BLOCK_SIZE_SB64X64) {
+      subsize = BLOCK_SIZE_SB32X32;
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+    }
+
+    for (n = 0; n < 4; n++) {
+      const int y_idx = n >> 1, x_idx = n & 0x01;
+
+      count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+                    no_pred_segcounts, temporal_predictor_count,
+                    t_unpred_seg_counts,
+                    mi_row + y_idx * bs, mi_col + x_idx * bs, subsize);
+    }
+  }
+}
+
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -253,15 +219,14 @@
   int t_pred_cost = INT_MAX;
 
   int i;
-  int tile_col, mb_row, mb_col;
+  int tile_col, mi_row, mi_col;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
 
-  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS];
+  vp9_prob no_pred_tree[MB_SEG_TREE_PROBS];
+  vp9_prob t_pred_tree[MB_SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
   const int mis = cm->mode_info_stride;
@@ -269,10 +234,8 @@
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs));
 
   vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
   vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
@@ -280,53 +243,17 @@
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-
   for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
     vp9_get_tile_col_offsets(cm, tile_col);
-    mi_ptr = cm->mi + cm->cur_tile_mb_col_start;
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+    mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
+    for (mi_row = 0; mi_row < cm->mi_rows;
+         mi_row += 8, mi_ptr += 8 * mis) {
       mi = mi_ptr;
-      for (mb_col = cm->cur_tile_mb_col_start;
-           mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) {
-        if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-          count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-                     t_unpred_seg_counts, 4, mb_row, mb_col);
-        } else {
-          for (i = 0; i < 4; i++) {
-            int x_idx = (i & 1) << 1, y_idx = i & 2;
-            MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
-
-            if (mb_col + x_idx >= cm->mb_cols ||
-                mb_row + y_idx >= cm->mb_rows) {
-              continue;
-            }
-
-            if (sb_mi->mbmi.sb_type) {
-              assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-              count_segs(cpi, sb_mi, no_pred_segcounts,
-                         temporal_predictor_count, t_unpred_seg_counts, 2,
-                         mb_row + y_idx, mb_col + x_idx);
-            } else {
-              int j;
-
-              for (j = 0; j < 4; j++) {
-                const int x_idx_mb = x_idx + (j & 1);
-                const int y_idx_mb = y_idx + (j >> 1);
-                MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
-
-                if (mb_col + x_idx_mb >= cm->mb_cols ||
-                    mb_row + y_idx_mb >= cm->mb_rows) {
-                  continue;
-                }
-
-                assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-                count_segs(cpi, mb_mi, no_pred_segcounts,
-                           temporal_predictor_count, t_unpred_seg_counts,
-                           1, mb_row + y_idx_mb, mb_col + x_idx_mb);
-              }
-            }
-          }
-        }
+      for (mi_col = cm->cur_tile_mi_col_start;
+           mi_col < cm->cur_tile_mi_col_end;
+           mi_col += 8, mi += 8) {
+        count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+                      t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
       }
     }
   }
@@ -340,21 +267,19 @@
   if (cm->frame_type != KEY_FRAME) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
-    calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                            t_pred_tree_mod);
-    t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                                   t_pred_tree_mod);
+    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
-      t_nopred_prob[i] = get_binary_prob(temporal_predictor_count[i][0],
-                                         temporal_predictor_count[i][1]);
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
 
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
+
       // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
+      t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp9_cost_one(t_nopred_prob[i]);
     }
   }
 
@@ -361,15 +286,10 @@
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(xd->mb_segment_mispred_tree_probs,
-               t_pred_tree_mod, sizeof(t_pred_tree_mod));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
+    vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
   } else {
     cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
+    vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -15,8 +15,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x);
-
 void vp9_enable_segmentation(VP9_PTR ptr);
 void vp9_disable_segmentation(VP9_PTR ptr);
 
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -26,7 +26,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
@@ -41,22 +40,17 @@
                                             int mv_col,
                                             uint8_t *pred) {
   const int which_mv = 0;
-  int_mv subpel_mv;
-  int_mv fullpel_mv;
+  int_mv mv;
 
-  subpel_mv.as_mv.row = mv_row;
-  subpel_mv.as_mv.col = mv_col;
-  // TODO(jkoleszar): Make this rounding consistent with the rest of the code
-  fullpel_mv.as_mv.row = (mv_row >> 1) & ~7;
-  fullpel_mv.as_mv.col = (mv_col >> 1) & ~7;
+  mv.as_mv.row = mv_row;
+  mv.as_mv.col = mv_col;
 
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
-                            &subpel_mv,
+                            &mv,
                             &xd->scale_factor[which_mv],
                             16, 16,
-                            which_mv <<
-                            (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                            which_mv,
                             &xd->subpix);
 
   stride = (stride + 1) >> 1;
@@ -63,20 +57,18 @@
 
   vp9_build_inter_predictor_q4(u_mb_ptr, stride,
                                &pred[256], 8,
-                               &fullpel_mv, &subpel_mv,
+                               &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 
   vp9_build_inter_predictor_q4(v_mb_ptr, stride,
                                &pred[320], 8,
-                               &fullpel_mv, &subpel_mv,
+                               &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 }
 
@@ -126,27 +118,23 @@
 #if ALT_REF_MC_ENABLED
 
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
-                                              YV12_BUFFER_CONFIG *arf_frame,
-                                              YV12_BUFFER_CONFIG *frame_ptr,
-                                              int mb_offset,
+                                              uint8_t *arf_frame_buf,
+                                              uint8_t *frame_ptr_buf,
+                                              int stride,
                                               int error_thresh) {
   MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD* const xd = &x->e_mbd;
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
 
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
   int_mv best_ref_mv1;
   int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  int_mv *ref_mv;
 
   // Save input state
-  uint8_t **base_src = b->base_src;
-  int src = b->src;
-  int src_stride = b->src_stride;
-  uint8_t **base_pre = d->base_pre;
-  int pre = d->pre;
-  int pre_stride = d->pre_stride;
+  struct buf_2d src = x->plane[0].src;
+  struct buf_2d pre = xd->plane[0].pre[0];
 
   best_ref_mv1.as_int = 0;
   best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
@@ -153,26 +141,22 @@
   best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
 
   // Setup frame pointers
-  b->base_src = &arf_frame->y_buffer;
-  b->src_stride = arf_frame->y_stride;
-  b->src = mb_offset;
+  x->plane[0].src.buf = arf_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = frame_ptr_buf;
+  xd->plane[0].pre[0].stride = stride;
 
-  d->base_pre = &frame_ptr->y_buffer;
-  d->pre_stride = frame_ptr->y_stride;
-  d->pre = mb_offset;
-
   // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step +
-                 ((cpi->Speed > 5) ? 1 : 0);
-  } else {
+  if (cpi->speed < 8)
+    step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+  else
     step_param = cpi->sf.first_step + 2;
-  }
 
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0],
+  ref_mv = &x->e_mbd.mode_info_context->bmi[0].as_mv[0];
+  bestsme = vp9_hex_search(x, &best_ref_mv1_full, ref_mv,
                            step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
                            NULL, NULL, NULL, NULL,
                            &best_ref_mv1);
@@ -184,7 +168,7 @@
     int distortion;
     unsigned int sse;
     // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0],
+    bestsme = cpi->find_fractional_mv_step(x, ref_mv,
                                            &best_ref_mv1,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
@@ -193,13 +177,9 @@
   }
 #endif
 
-  // Save input state
-  b->base_src = base_src;
-  b->src = src;
-  b->src_stride = src_stride;
-  d->base_pre = base_pre;
-  d->pre = pre;
-  d->pre_stride = pre_stride;
+  // Restore input state
+  x->plane[0].src = src;
+  xd->plane[0].pre[0] = pre;
 
   return bestsme;
 }
@@ -225,10 +205,12 @@
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
 
   // Save input state
-  uint8_t *y_buffer = mbd->pre.y_buffer;
-  uint8_t *u_buffer = mbd->pre.u_buffer;
-  uint8_t *v_buffer = mbd->pre.v_buffer;
+  uint8_t* input_buffer[MAX_MB_PLANE];
+  int i;
 
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
 #if ALT_REF_MC_ENABLED
     // Source frames are extended to 16 pixels.  This is different than
@@ -264,8 +246,8 @@
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->block[0].bmi.as_mv[0].as_mv.row = 0;
-        mbd->block[0].bmi.as_mv[0].as_mv.col = 0;
+        mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -278,9 +260,9 @@
           // Find best match in this frame by MC
           err = temporal_filter_find_matching_mb_c
                 (cpi,
-                 cpi->frames[alt_ref_index],
-                 cpi->frames[frame],
-                 mb_y_offset,
+                 cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_stride,
                  THRESH_LOW);
 #endif
           // Assign higher weight to matching MB if it's error
@@ -298,8 +280,8 @@
            cpi->frames[frame]->u_buffer + mb_uv_offset,
            cpi->frames[frame]->v_buffer + mb_uv_offset,
            cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv[0].as_mv.row,
-           mbd->block[0].bmi.as_mv[0].as_mv.col,
+           mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row,
+           mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col,
            predictor);
 
           // Apply the filter (YUV)
@@ -372,16 +354,15 @@
   }
 
   // Restore input state
-  mbd->pre.y_buffer = y_buffer;
-  mbd->pre.u_buffer = u_buffer;
-  mbd->pre.v_buffer = v_buffer;
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    mbd->plane[i].pre[0].buf = input_buffer[i];
 }
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
+  VP9_COMMON *const cm = &cpi->common;
+
   int frame = 0;
 
-  int num_frames_backward = 0;
-  int num_frames_forward = 0;
   int frames_to_blur_backward = 0;
   int frames_to_blur_forward = 0;
   int frames_to_blur = 0;
@@ -391,15 +372,13 @@
   int blur_type = cpi->oxcf.arnr_type;
   int max_frames = cpi->active_arnr_frames;
 
-  num_frames_backward = distance;
-  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
-                       - (num_frames_backward + 1);
+  const int num_frames_backward = distance;
+  const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                               - (num_frames_backward + 1);
 
   switch (blur_type) {
     case 1:
-      /////////////////////////////////////////
       // Backward Blur
-
       frames_to_blur_backward = num_frames_backward;
 
       if (frames_to_blur_backward >= max_frames)
@@ -409,7 +388,6 @@
       break;
 
     case 2:
-      /////////////////////////////////////////
       // Forward Blur
 
       frames_to_blur_forward = num_frames_forward;
@@ -422,7 +400,6 @@
 
     case 3:
     default:
-      /////////////////////////////////////////
       // Center Blur
       frames_to_blur_forward = num_frames_forward;
       frames_to_blur_backward = num_frames_backward;
@@ -462,23 +439,91 @@
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
   vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
-      &cpi->common.yv12_fb[cpi->common.new_fb_idx],
-      cpi->common.width,
-      cpi->common.height);
+      cm->yv12_fb[cm->new_fb_idx].y_crop_width,
+      cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+      cm->width, cm->height);
   cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
 
   // Setup frame pointers, NULL indicates frame not included in filter
   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
   for (frame = 0; frame < frames_to_blur; frame++) {
-    int which_buffer =  start_frame - frame;
+    int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
     cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
-  temporal_filter_iterate_c(
-    cpi,
-    frames_to_blur,
-    frames_to_blur_backward,
-    strength);
+  temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
+                            strength);
+}
+
+void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
+                           const int group_boost) {
+  int half_gf_int;
+  int frames_after_arf;
+  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+  int q;
+
+  // Define the arnr filter width for this group of frames:
+  // We only filter frames that lie within a distance of half
+  // the GF interval from the ARF frame. We also have to trap
+  // cases where the filter extends beyond the end of clip.
+  // Note: this_frame->frame has been updated in the loop
+  // so it now points at the ARF frame.
+  half_gf_int = cpi->baseline_gf_interval >> 1;
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
+
+  switch (cpi->oxcf.arnr_type) {
+    case 1:  // Backward filter
+      frames_fwd = 0;
+      if (frames_bwd > half_gf_int)
+        frames_bwd = half_gf_int;
+      break;
+
+    case 2:  // Forward filter
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      frames_bwd = 0;
+      break;
+
+    case 3:  // Centered filter
+    default:
+      frames_fwd >>= 1;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+
+      frames_bwd = frames_fwd;
+
+      // For even length filter there is one more frame backward
+      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+      if (frames_bwd < half_gf_int)
+        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+      break;
+  }
+
+  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+
+  // Adjust the strength based on active max q
+  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
+  if (q > 8) {
+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+  } else {
+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);
+    if (cpi->active_arnr_strength < 0)
+      cpi->active_arnr_strength = 0;
+  }
+
+  // Adjust number of frames in filter and strength based on gf boost level.
+  if (cpi->active_arnr_frames > (group_boost / 150)) {
+    cpi->active_arnr_frames = (group_boost / 150);
+    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  }
+  if (cpi->active_arnr_strength > (group_boost / 300)) {
+    cpi->active_arnr_strength = (group_boost / 300);
+  }
 }
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -12,5 +12,7 @@
 #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
+void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
+                           const int group_boost);
 
 #endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -25,31 +25,12 @@
    compressions, then generating vp9_context.c = initial stats. */
 
 #ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
-vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
-vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
-
-extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
+vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES];
+extern vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC4X4_TOKENS];
-unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC8X8_TOKENS];
-unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC16X16_TOKENS];
-unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC32X32_TOKENS];
-unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                            [NZC_BITS_EXTRA][2];
-#endif
-#endif
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
@@ -59,7 +40,7 @@
 static void fill_value_tokens() {
 
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_extra_bit_struct *const e = vp9_extra_bits;
+  vp9_extra_bit *const e = vp9_extra_bits;
 
   int i = -DCT_MAX_VALUE;
   int sign = 1;
@@ -77,25 +58,25 @@
 
         while (++j < 11  &&  e[j].base_val <= a) {}
 
-        t[i].Token = --j;
+        t[i].token = --j;
         eb |= (a - e[j].base_val) << 1;
       } else
-        t[i].Token = a;
+        t[i].token = a;
 
-      t[i].Extra = eb;
+      t[i].extra = eb;
     }
 
     // initialize the cost for extra bits for all possible coefficient value.
     {
       int cost = 0;
-      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
+      vp9_extra_bit *p = vp9_extra_bits + t[i].token;
 
       if (p->base_val) {
-        const int extra = t[i].Extra;
-        const int Length = p->Len;
+        const int extra = t[i].extra;
+        const int length = p->len;
 
-        if (Length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
+        if (length)
+          cost += treed_cost(p->tree, p->prob, extra >> 1, length);
 
         cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
         dct_value_cost[i + DCT_MAX_VALUE] = cost;
@@ -111,139 +92,99 @@
 
 extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
 
-static void tokenize_b(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       const int ib,
-                       TOKENEXTRA **tp,
-                       PLANE_TYPE type,
-                       TX_SIZE tx_size,
-                       int dry_run) {
+struct tokenize_b_args {
+  VP9_COMP *cpi;
+  MACROBLOCKD *xd;
+  TOKENEXTRA **tp;
+  TX_SIZE tx_size;
+  int dry_run;
+};
+
+static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct tokenize_b_args* const args = arg;
+  VP9_COMP *cpi = args->cpi;
+  MACROBLOCKD *xd = args->xd;
+  TOKENEXTRA **tp = args->tp;
+  PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+  TX_SIZE tx_size = ss_txfrm_size / 2;
+  int dry_run = args->dry_run;
+
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0;
-  const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */
+  int c = 0, rc = 0;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib;
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ?
+                                   BLOCK_SIZE_SB8X8 : mbmi->sb_type;
+  const int bwl = b_width_log2(sb_type);
+  const int off = block >> (2 * tx_size);
+  const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
+  const int aoff = (off & ((1 << mod) - 1)) << tx_size;
+  const int loff = (off >> mod) << tx_size;
+  ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
+  ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
   int seg_eob, default_eob, pad;
   const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
   const int *scan, *nb;
   vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
+  vp9_coeff_probs_model *coef_probs;
+  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
+  ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  int zerosleft, nzc = 0;
-  if (eob == 0)
-    assert(xd->nzcs[ib] == 0);
-#endif
+  TX_TYPE tx_type = DCT_DCT;
+  const uint8_t * band_translate;
+  assert((!type && !plane) || (type && plane));
 
-  if (sb_type == BLOCK_SIZE_SB64X64) {
-    a = (ENTROPY_CONTEXT *)xd->above_context +
-                                             vp9_block2above_sb64[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a3 = l2 = l3 = NULL;
-  } else {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
-    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
-  }
-
+  counts = cpi->coef_counts[tx_size];
+  coef_probs = cpi->common.fc.coef_probs[tx_size];
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
-      a_ec = *a;
-      l_ec = *l;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block) : DCT_DCT;
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       seg_eob = 16;
-      scan = vp9_default_zig_zag1d_4x4;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_4x4;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_4x4;
-        }
-      }
-      counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
       seg_eob = 64;
-      scan = vp9_default_zig_zag1d_8x8;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_8x8;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_8x8;
-        }
-      }
-      counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
+      scan = get_scan_8x8(tx_type);
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      }
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       seg_eob = 256;
-      scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_16x16;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_16x16;
-        }
-      }
-      counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
+      scan = get_scan_16x16(tx_type);
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       seg_eob = 1024;
-      scan = vp9_default_zig_zag1d_32x32;
-      counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
+      scan = vp9_default_scan_32x32;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
   default_eob = seg_eob;
 
@@ -250,220 +191,94 @@
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
 
+  c = 0;
   do {
-    const int band = get_coef_band(scan, tx_size, c);
+    const int band = get_coef_band(band_translate, c);
     int token;
     int v = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;
-#endif
+    rc = scan[c];
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
     if (c < eob) {
-      const int rc = scan[c];
       v = qcoeff_ptr[rc];
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
-      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
-      token    = vp9_dct_value_tokens_ptr[v].Token;
+      t->extra = vp9_dct_value_tokens_ptr[v].extra;
+      token    = vp9_dct_value_tokens_ptr[v].token;
     } else {
-#if CONFIG_CODE_NONZEROCOUNT
-      break;
-#else
       token = DCT_EOB_TOKEN;
-#endif
     }
 
-    t->Token = token;
-    t->context_tree = probs[type][ref][band][pt];
-#if CONFIG_CODE_NONZEROCOUNT
-    // Skip zero node if there are no zeros left
-    t->skip_eob_node = 1 + (zerosleft == 0);
+    t->token = token;
+    t->context_tree = coef_probs[type][ref][band][pt];
+    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+
+#if CONFIG_BALANCED_COEFTREE
+    assert(token <= ZERO_TOKEN ||
+           vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
 #else
-    t->skip_eob_node = (c > 0) && (token_cache[c - 1] == 0);
+    assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
 #endif
-    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
+
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
+#if CONFIG_BALANCED_COEFTREE
+      if (!t->skip_eob_node && token > ZERO_TOKEN)
+#else
       if (!t->skip_eob_node)
+#endif
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    nzc += (v != 0);
-#endif
-    token_cache[c] = token;
-
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++t;
   } while (c < eob && ++c < seg_eob);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(nzc == xd->nzcs[ib]);
-#endif
 
   *tp = t;
-  a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
-  a[0] = a_ec;
-  l[0] = l_ec;
-
-  if (tx_size == TX_8X8) {
-    a[1] = a_ec;
-    l[1] = l_ec;
-  } else if (tx_size == TX_16X16) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-    } else {
-      a1[0] = a1[1] = a[1] = a_ec;
-      l1[0] = l1[1] = l[1] = l_ec;
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    set_contexts_on_border(xd, bsize, plane, tx_size, c, aoff, loff, A, L);
+  } else {
+    for (pt = 0; pt < (1 << tx_size); pt++) {
+      A[pt] = L[pt] = c > 0;
     }
-  } else if (tx_size == TX_32X32) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
-    } else {
-      a[1] = a1[0] = a1[1] = a_ec;
-      l[1] = l1[0] = l1[1] = l_ec;
-      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
-      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
-    }
   }
 }
 
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 16; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
+struct is_skippable_args {
+  MACROBLOCKD *xd;
+  int *skippable;
+};
+static void is_skippable(int plane, int block,
+                         BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *argv) {
+  struct is_skippable_args *args = argv;
+  args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
 }
 
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i;
-
-  for (i = 16; i < 24; i++)
-    skip &= (!xd->eobs[i]);
-  return skip;
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block(xd, bsize, is_skippable, &args);
+  return result;
 }
 
-static int mb_is_skippable_4x4(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_4x4(xd) &
-          vp9_mbuv_is_skippable_4x4(xd));
+int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     is_skippable, &args);
+  return result;
 }
 
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 16; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
+int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block_uv(xd, bsize, is_skippable, &args);
+  return result;
 }
 
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (!xd->eobs[16]) & (!xd->eobs[20]);
-}
-
-static int mb_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_8x8(xd) &
-          vp9_mbuv_is_skippable_8x8(xd));
-}
-
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_8x8(xd) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->eobs[0]);
-}
-
-static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
-}
-
-int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
-  return (!xd->eobs[0]);
-}
-
-int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->eobs[64]) & (!xd->eobs[80]);
-}
-
-static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_32x32(xd) &&
-         vp9_sbuv_is_skippable_16x16(xd);
-}
-
-int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);
-}
-
-int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 64; i < 96; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_8x8(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd);
-}
-
-int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 64; i < 96; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_4x4(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd);
-}
-
 void vp9_tokenize_sb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
-                     int dry_run) {
+                     int dry_run, BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON * const cm = &cpi->common;
   MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
   TOKENEXTRA *t_backup = *t;
@@ -470,32 +285,17 @@
   const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
   const int segment_id = mbmi->segment_id;
   const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  int b;
+  const TX_SIZE txfm_size = mbmi->txfm_size;
+  struct tokenize_b_args arg = {
+    cpi, xd, t, txfm_size, dry_run
+  };
 
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
-      break;
-    case TX_16X16:
-      mbmi->mb_skip_coeff = sb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      mbmi->mb_skip_coeff = sb_is_skippable_8x8(xd);
-      break;
-    case TX_4X4:
-      mbmi->mb_skip_coeff = sb_is_skippable_4x4(xd);
-      break;
-    default: assert(0);
-  }
+  mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize);
 
   if (mbmi->mb_skip_coeff) {
     if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cm->mb_no_coeff_skip) {
-      vp9_stuff_sb(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_sb_tokens_context(xd);
-    }
+      cm->fc.mbskip_count[mb_skip_context][1] += skip_inc;
+    vp9_reset_sb_tokens_context(xd, bsize);
     if (dry_run)
       *t = t_backup;
     return;
@@ -502,335 +302,29 @@
   }
 
   if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
+    cm->fc.mbskip_count[mb_skip_context][0] += skip_inc;
 
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,
-                 TX_32X32, dry_run);
-      for (b = 64; b < 96; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 64; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_16X16, dry_run);
-      for (b = 64; b < 96; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 64; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_8X8, dry_run);
-      for (b = 64; b < 96; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 64; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_4X4, dry_run);
-      for (b = 64; b < 96; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
+  foreach_transformed_block(xd, bsize, tokenize_b, &arg);
 
   if (dry_run)
     *t = t_backup;
 }
 
-int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 64)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) {
-  return (!xd->eobs[256]) & (!xd->eobs[320]);
-}
-
-static int sb64_is_skippable_32x32(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);
-}
-
-int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_16x16(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd);
-}
-
-int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_8x8(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd);
-}
-
-int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_4x4(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd);
-}
-
-void vp9_tokenize_sb64(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       TOKENEXTRA **t,
-                       int dry_run) {
-  VP9_COMMON * const cm = &cpi->common;
-  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
-  TOKENEXTRA *t_backup = *t;
-  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
-  const int segment_id = mbmi->segment_id;
-  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  int b;
-
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      mbmi->mb_skip_coeff = sb64_is_skippable_32x32(xd);
-      break;
-    case TX_16X16:
-      mbmi->mb_skip_coeff = sb64_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      mbmi->mb_skip_coeff = sb64_is_skippable_8x8(xd);
-      break;
-    case TX_4X4:
-      mbmi->mb_skip_coeff = sb64_is_skippable_4x4(xd);
-      break;
-    default: assert(0);
-  }
-
-  if (mbmi->mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cm->mb_no_coeff_skip) {
-      vp9_stuff_sb64(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_sb64_tokens_context(xd);
-    }
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      for (b = 0; b < 256; b += 64)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_32X32, dry_run);
-      for (b = 256; b < 384; b += 64)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_32X32, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 256; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_16X16, dry_run);
-      for (b = 256; b < 384; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 256; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_8X8, dry_run);
-      for (b = 256; b < 384; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 256; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_4X4, dry_run);
-      for (b = 256; b < 384; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run)
-    *t = t_backup;
-}
-
-void vp9_tokenize_mb(VP9_COMP *cpi,
-                     MACROBLOCKD *xd,
-                     TOKENEXTRA **t,
-                     int dry_run) {
-  int b;
-  int tx_size = xd->mode_info_context->mbmi.txfm_size;
-  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
-  TOKENEXTRA *t_backup = *t;
-
-  // If the MB is going to be skipped because of a segment level flag
-  // exclude this from the skip count stats used to calculate the
-  // transmitted skip probability;
-  int skip_inc;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-    skip_inc = 1;
-  } else
-    skip_inc = 0;
-
-  switch (tx_size) {
-    case TX_16X16:
-
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV)
-        xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8_4x4uv(xd);
-      else
-        xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8(xd);
-      break;
-
-    default:
-      xd->mode_info_context->mbmi.mb_skip_coeff =
-          mb_is_skippable_4x4(xd);
-      break;
-  }
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cpi->common.mb_no_coeff_skip) {
-      vp9_stuff_mb(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_mb_tokens_context(xd);
-    }
-
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  if (tx_size == TX_16X16) {
-    tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-    for (b = 16; b < 24; b += 4) {
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-    }
-  } else if (tx_size == TX_8X8) {
-    for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-    }
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      for (b = 16; b < 24; b++) {
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      }
-    } else {
-      for (b = 16; b < 24; b += 4) {
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      }
-    }
-  } else {
-    for (b = 0; b < 16; b++)
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-    for (b = 16; b < 24; b++)
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-  }
-  if (dry_run)
-    *t = t_backup;
-}
-
 #ifdef ENTROPY_STATS
 void init_context_counters(void) {
   FILE *f = fopen("context.bin", "rb");
   if (!f) {
-    vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4));
-    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
-    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
-    vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32));
+    vp9_zero(context_counters);
   } else {
-    fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-    fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
+    fread(context_counters, sizeof(context_counters), 1, f);
     fclose(f);
   }
 
   f = fopen("treeupdate.bin", "rb");
   if (!f) {
-    vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4));
-    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
-    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
-    vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32));
+    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
   } else {
-    fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
-    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-    fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
+    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
     fclose(f);
   }
 }
@@ -932,32 +426,29 @@
   fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
 
   /* print counts */
-  print_counter(f, context_counters_4x4, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
                 "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
-  print_counter(f, context_counters_8x8, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
                 "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
-  print_counter(f, context_counters_16x16, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
                 "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
-  print_counter(f, context_counters_32x32, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
                 "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
 
   /* print coefficient probabilities */
-  print_probs(f, context_counters_4x4, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
               "default_coef_probs_4x4[BLOCK_TYPES]");
-  print_probs(f, context_counters_8x8, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
               "default_coef_probs_8x8[BLOCK_TYPES]");
-  print_probs(f, context_counters_16x16, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
               "default_coef_probs_16x16[BLOCK_TYPES]");
-  print_probs(f, context_counters_32x32, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
               "default_coef_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
 
   f = fopen("context.bin", "wb");
-  fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-  fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
+  fwrite(context_counters, sizeof(context_counters), 1, f);
   fclose(f);
 }
 #endif
@@ -964,269 +455,4 @@
 
 void vp9_tokenize_initialize() {
   fill_value_tokens();
-}
-
-static void stuff_b(VP9_COMP *cpi,
-                    MACROBLOCKD *xd,
-                    const int ib,
-                    TOKENEXTRA **tp,
-                    PLANE_TYPE type,
-                    TX_SIZE tx_size,
-                    int dry_run) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
-#if CONFIG_CODE_NONZEROCOUNT == 0
-  vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
-  int pt, band;
-  TOKENEXTRA *t = *tp;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
-#endif
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
-
-  if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context +
-                                             vp9_block2above_sb64[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = l2 = a3 = l3 = NULL;
-  } else {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
-    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
-  }
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      a_ec = a[0];
-      l_ec = l[0];
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
-#endif
-      break;
-    case TX_8X8:
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
-#endif
-      break;
-    case TX_16X16:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      }
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
-#endif
-      break;
-    case TX_32X32:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      }
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
-#endif
-      break;
-  }
-
-#if CONFIG_CODE_NONZEROCOUNT == 0
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-  band = 0;
-  t->Token = DCT_EOB_TOKEN;
-  t->context_tree = probs[type][ref][band][pt];
-  t->skip_eob_node = 0;
-  ++t;
-  *tp = t;
-  if (!dry_run) {
-    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
-  }
-#endif
-  *a = *l = 0;
-  if (tx_size == TX_8X8) {
-    a[1] = 0;
-    l[1] = 0;
-  } else if (tx_size == TX_16X16) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = 0;
-      l[1] = l[2] = l[3] = 0;
-    } else {
-      a1[0] = a1[1] = a[1] = a_ec;
-      l1[0] = l1[1] = l[1] = l_ec;
-    }
-  } else if (tx_size == TX_32X32) {
-    if (type != PLANE_TYPE_Y_WITH_DC) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
-    } else {
-      a[1] = a1[0] = a1[1] = a_ec;
-      l[1] = l1[0] = l1[1] = l_ec;
-      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
-      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
-    }
-  }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
-                           TOKENEXTRA **t, int dry_run) {
-  int b;
-  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-  }
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TOKENEXTRA * const t_backup = *t;
-
-  if (tx_size == TX_16X16) {
-    stuff_mb_16x16(cpi, xd, t, dry_run);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
-    } else {
-      stuff_mb_8x8(cpi, xd, t, dry_run);
-    }
-  } else {
-    stuff_mb_4x4(cpi, xd, t, dry_run);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
-void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TOKENEXTRA * const t_backup = *t;
-  int b;
-
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-      for (b = 64; b < 96; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 64; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-      for (b = 64; b < 96; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 64; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-      for (b = 64; b < 96; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 64; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-      for (b = 64; b < 96; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
-void vp9_stuff_sb64(VP9_COMP *cpi, MACROBLOCKD *xd,
-                    TOKENEXTRA **t, int dry_run) {
-  TOKENEXTRA * const t_backup = *t;
-  int b;
-
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      for (b = 0; b < 256; b += 64)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-      for (b = 256; b < 384; b += 64)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_32X32, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 256; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-      for (b = 256; b < 384; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 256; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-      for (b = 256; b < 384; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 256; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-      for (b = 256; b < 384; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
 }
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -17,14 +17,14 @@
 void vp9_tokenize_initialize();
 
 typedef struct {
-  int16_t Token;
-  int16_t Extra;
+  int16_t token;
+  int16_t extra;
 } TOKENVALUE;
 
 typedef struct {
   const vp9_prob *context_tree;
-  int16_t         Extra;
-  uint8_t         Token;
+  int16_t         extra;
+  uint8_t         token;
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
@@ -31,51 +31,19 @@
 typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                [MAX_ENTROPY_TOKENS + 1];
 
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd);
-
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
+int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
+int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 struct VP9_COMP;
 
-void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                     TOKENEXTRA **t, int dry_run);
 void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                     TOKENEXTRA **t, int dry_run);
-void vp9_tokenize_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                       TOKENEXTRA **t, int dry_run);
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize);
 
-void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run);
-void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run);
-void vp9_stuff_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                    TOKENEXTRA **t, int dry_run);
-
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
 
-extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
+extern vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES];
 #endif
 
 extern const int *vp9_dct_value_cost_ptr;
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -8,35 +8,31 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_common.h"
 
-static void cost(
-  int *const C,
-  vp9_tree T,
-  const vp9_prob *const P,
-  int i,
-  int c
-) {
-  const vp9_prob p = P [i >> 1];
+static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+                 int i, int c) {
+  const vp9_prob prob = probs[i / 2];
+  int b;
 
-  do {
-    const vp9_tree_index j = T[i];
-    const int d = c + vp9_cost_bit(p, i & 1);
+  for (b = 0; b <= 1; ++b) {
+    const int cc = c + vp9_cost_bit(prob, b);
+    const vp9_tree_index ii = tree[i + b];
 
-    if (j <= 0)
-      C[-j] = d;
+    if (ii <= 0)
+      costs[-ii] = cc;
     else
-      cost(C, T, P, j, d);
-  } while (++i & 1);
+      cost(costs, tree, probs, ii, cc);
+  }
 }
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 0, 0);
+
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  cost(costs, tree, probs, 0, 0);
 }
 
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
-  assert(t[1] > 0 && t[0] <= 0);
-  c[-t[0]] = vp9_cost_bit(p[0], 0);
-  cost(c, t, p, 2, 0);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  assert(tree[0] <= 0 && tree[1] > 0);
+
+  costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
+  cost(costs, tree, probs, 2, 0);
 }
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -19,11 +19,8 @@
 
 #include "vp9/encoder/vp9_boolhuff.h"       /* for now */
 
-typedef BOOL_CODER vp9_writer;
 
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
+#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
 
 /* Approximate length of an encoded bool in 256ths of a bit at given prob */
 
@@ -38,69 +35,53 @@
 /* Both of these return bits, not scaled bits. */
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
                                           vp9_prob p) {
-  /* Imitate existing calculation */
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
 static INLINE unsigned int cost_branch(const unsigned int ct[2],
                                        vp9_prob p) {
-  /* Imitate existing calculation */
   return cost_branch256(ct, p) >> 8;
 }
 
 
-/* Small functions to write explicit values and tokens, as well as
-   estimate their lengths. */
-
-static INLINE void treed_write(vp9_writer *const w,
-                               vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
+static INLINE void treed_write(vp9_writer *w,
+                               vp9_tree tree, const vp9_prob *probs,
+                               int bits, int len) {
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    vp9_write(w, b, p[i >> 1]);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    vp9_write(w, bit, probs[i >> 1]);
+    i = tree[i + bit];
+  } while (len);
 }
 
-static INLINE void write_token(vp9_writer *const w,
-                               vp9_tree t,
-                               const vp9_prob *const p,
-                               vp9_token *const x) {
-  treed_write(w, t, p, x->value, x->Len);
+static INLINE void write_token(vp9_writer *w, vp9_tree tree,
+                               const vp9_prob *probs,
+                               const struct vp9_token *token) {
+  treed_write(w, tree, probs, token->value, token->len);
 }
 
-static INLINE int treed_cost(vp9_tree t,
-                             const vp9_prob *const p,
-                             int v,
-                             /* number of bits in v, assumed nonzero */
-                             int n) {
-  int c = 0;
+static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+                             int bits, int len) {
+  int cost = 0;
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    c += vp9_cost_bit(p[i >> 1], b);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    cost += vp9_cost_bit(probs[i >> 1], bit);
+    i = tree[i + bit];
+  } while (len);
 
-  return c;
+  return cost;
 }
 
-static INLINE int cost_token(vp9_tree t,
-                             const vp9_prob *const p,
-                             vp9_token *const x) {
-  return treed_cost(t, p, x->value, x->Len);
+static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
+                             const struct vp9_token *token) {
+  return treed_cost(tree, probs, token->value, token->len);
 }
 
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
 
 #endif  // VP9_ENCODER_VP9_TREEWRITER_H_
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,31 @@
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < weight; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += weight;
+    pred += weight;
+    ref += ref_stride;
+  }
+}
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -24,6 +25,234 @@
   return sum;
 }
 
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+
+  return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+
+  return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+
+  return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+
+  return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -89,6 +318,11 @@
   return (var - (((unsigned int)avg * avg) >> 7));
 }
 
+void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
+}
 
 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
                                int  source_stride,
@@ -103,6 +337,32 @@
   return (var - (((unsigned int)avg * avg) >> 6));
 }
 
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
                                int  source_stride,
                                const uint8_t *ref_ptr,
@@ -130,7 +390,46 @@
   return var;
 }
 
+unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
+                           int  source_stride,
+                           const uint8_t *ref_ptr,
+                           int  recon_stride,
+                           unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
+                           int  source_stride,
+                           const uint8_t *ref_ptr,
+                           int  recon_stride,
+                           unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
+                          int  source_stride,
+                          const uint8_t *ref_ptr,
+                          int  recon_stride,
+                          unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+
 unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
@@ -139,22 +438,48 @@
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
-  uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
+  const int16_t *hfilter, *vfilter;
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   // First filter 1d Horizontal
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
 
   // Now filter Verticaly
-  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
 
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
 
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter 1d Horizontal
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter Vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
@@ -162,19 +487,43 @@
                                          const uint8_t *dst_ptr,
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
-  uint16_t FData3[9 * 8];  // Temp data bufffer used in filtering
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
 
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -182,19 +531,44 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[17 * 16];  // Temp data bufffer used in filtering
+  uint16_t fdata3[17 * 16];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
 
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -202,20 +576,43 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[65 * 64];  // Temp data bufffer used in filtering
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   uint8_t temp2[68 * 64];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 65, 64, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 64, 64, 64, 64, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
 
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -223,19 +620,43 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[33 * 32];  // Temp data bufffer used in filtering
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   uint8_t temp2[36 * 32];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
 
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -363,19 +784,43 @@
                                           const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  uint16_t FData3[16 * 9];  // Temp data bufffer used in filtering
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
 
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -383,17 +828,129 @@
                                           const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  uint16_t FData3[9 * 16];  // Temp data bufffer used in filtering
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 17, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
 
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+
+  return vp9_variance8x4_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  return vp9_variance8x4_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be
+  // this big; the same issue appears in all other block size settings.
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+
+  return vp9_variance4x8_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  return vp9_variance4x8_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
--- /dev/null
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_BIT_WRITE_BUFFER_H_
+#define VP9_BIT_WRITE_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+struct vp9_write_bit_buffer {
+  uint8_t *bit_buffer;
+  size_t bit_offset;
+};
+
+static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const int off = wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (q == CHAR_BIT -1) {
+    wb->bit_buffer[p] = bit << q;
+  } else {
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
+                              int data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+
+#endif  // VP9_BIT_WRITE_BUFFER_H_
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ b/vp9/encoder/x86/vp9_encodeopt.asm
@@ -123,254 +123,3 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_mmx_impl) PRIVATE
-sym(vp9_mbblock_error_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        mm2,        mm2
-
-        mov         rcx,        16
-
-.mberror_loop_mmx:
-        movq        mm3,       [rsi]
-        movq        mm4,       [rdi]
-
-        movq        mm5,       [rsi+8]
-        movq        mm6,       [rdi+8]
-
-
-        psubw       mm5,        mm6
-        pmaddwd     mm5,        mm5
-
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        movq        mm3,       [rsi+16]
-
-        movq        mm4,       [rdi+16]
-        movq        mm5,       [rsi+24]
-
-        movq        mm6,       [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        add         rsi,        32
-
-        add         rdi,        32
-        sub         rcx,        1
-
-        jnz         .mberror_loop_mmx
-
-        movq        mm0,        mm2
-        psrlq       mm2,        32
-
-        paddd       mm0,        mm2
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_xmm_impl) PRIVATE
-sym(vp9_mbblock_error_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm5,       xmm5
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm4,       xmm4
-
-        mov         rcx,        16
-
-.mberror_loop:
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-
-        psubw       xmm2,       xmm3
-        pmaddwd     xmm2,       xmm2
-
-        psubw       xmm0,       xmm1
-
-        pmaddwd     xmm0,       xmm0
-        add         rsi,        32
-
-        add         rdi,        32
-
-        sub         rcx,        1
-        paddd       xmm4,       xmm2
-
-        paddd       xmm4,       xmm0
-        jnz         .mberror_loop
-
-        movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm5
-
-        punpckhdq   xmm4,       xmm5
-        paddd       xmm0,       xmm4
-
-        movdqa      xmm1,       xmm0
-        psrldq      xmm0,       8
-
-        paddd       xmm0,       xmm1
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl) PRIVATE
-sym(vp9_mbuverror_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            mm7,        mm7
-
-.mbuverror_loop_mmx:
-
-        movq            mm1,        [rsi]
-        movq            mm2,        [rdi]
-
-        psubw           mm1,        mm2
-        pmaddwd         mm1,        mm1
-
-
-        movq            mm3,        [rsi+8]
-        movq            mm4,        [rdi+8]
-
-        psubw           mm3,        mm4
-        pmaddwd         mm3,        mm3
-
-
-        paddd           mm7,        mm1
-        paddd           mm7,        mm3
-
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop_mmx
-
-        movq            mm0,        mm7
-        psrlq           mm7,        32
-
-        paddd           mm0,        mm7
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl) PRIVATE
-sym(vp9_mbuverror_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            xmm3,       xmm3
-
-.mbuverror_loop:
-
-        movdqa          xmm1,       [rsi]
-        movdqa          xmm2,       [rdi]
-
-        psubw           xmm1,       xmm2
-        pmaddwd         xmm1,       xmm1
-
-        paddd           xmm3,       xmm1
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop
-
-        pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm3
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        paddd       xmm1,           xmm2
-
-        movdqa      xmm2,           xmm1
-
-        psrldq      xmm1,           8
-        paddd       xmm1,           xmm2
-
-        movq            rax,            xmm1
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp9/encoder/x86/vp9_quantize_mmx.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ /dev/null
@@ -1,380 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2) PRIVATE
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2) PRIVATE
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ /dev/null
@@ -1,254 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4) PRIVATE
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp9/common/vp9_entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3) PRIVATE
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp9/encoder/x86/vp9_quantize_x86.h
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -215,7 +215,11 @@
 
 INIT_XMM sse2
 SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
 SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
 SADNXN4D 16, 16
 SADNXN4D 16,  8
 SADNXN4D  8, 16
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -14,11 +14,11 @@
 
 ; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD64XN 1
+cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 64
+  mov              n_rowsd, %1
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -42,14 +42,19 @@
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
 
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD32XN 1
+cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 16
+  mov              n_rowsd, %1/2
   pxor                  m0, m0
 
 .loop:
@@ -74,7 +79,13 @@
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
 
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1
@@ -112,6 +123,7 @@
 %endmacro
 
 INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN  8 ; sad16x8_sse2
 
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -139,8 +139,38 @@
 
 DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
 
-unsigned int vp9_variance4x4_wmt(
+typedef unsigned int (*get_var_sse2) (
   const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+
+static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
+                        const unsigned char *ref_ptr, int  recon_stride,
+                        int  w, int  h, unsigned int *sse, int *sum,
+                        get_var_sse2 var_fn, int block_size) {
+  unsigned int sse0;
+  int sum0;
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      var_fn(src_ptr + source_stride * i + j, source_stride,
+             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+unsigned int vp9_variance4x4_sse2(
+  const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
@@ -148,13 +178,41 @@
   unsigned int var;
   int avg;
 
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 4));
+}
 
+unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
+                                  int  source_stride,
+                                  const uint8_t *ref_ptr,
+                                  int  recon_stride,
+                                  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
 }
 
-unsigned int vp9_variance8x8_wmt
+unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
+                                  int  source_stride,
+                                  const uint8_t *ref_ptr,
+                                  int  recon_stride,
+                                  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
+unsigned int vp9_variance8x8_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -164,14 +222,13 @@
   unsigned int var;
   int avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
+                  &var, &avg, vp9_get8x8var_sse2, 8);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 6));
-
 }
 
-
-unsigned int vp9_variance16x16_wmt
+unsigned int vp9_variance16x8_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -178,32 +235,32 @@
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
+  unsigned int var;
+  int avg;
 
-
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
+                  &var, &avg, vp9_get8x8var_sse2, 8);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-unsigned int vp9_mse16x16_wmt(
+unsigned int vp9_variance8x16_sse2
+(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return sse0;
-
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
+                &var, &avg, vp9_get8x8var_sse2, 8);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-
-unsigned int vp9_variance16x8_wmt
+unsigned int vp9_variance16x16_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -210,37 +267,112 @@
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
+  unsigned int var;
+  int avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
+                &var, &avg, vp9_get16x16var_sse2, 16);
   *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
-
+  return (var - (((unsigned int)avg * avg) >> 8));
 }
 
-unsigned int vp9_variance8x16_wmt
-(
+unsigned int vp9_mse16x16_wmt(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+  unsigned int sse0;
+  int sum0;
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                       &sum0);
+  *sse = sse0;
+  return sse0;
+}
 
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
+unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
   *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
+  return (var - (((int64_t)avg * avg) >> 10));
+}
 
+unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 12));
+}
+
+unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
 }
 
 unsigned int vp9_sub_pixel_variance4x4_wmt
--- a/vp9/encoder/x86/vp9_variance_ssse3.c
+++ b/vp9/encoder/x86/vp9_variance_ssse3.c
@@ -15,15 +15,6 @@
 
 #define HALFNDX 8
 
-extern unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
 extern void vp9_half_horiz_vert_variance16x_h_sse2
 (
   const unsigned char *ref_ptr,
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -17,26 +17,12 @@
 
 // TODO(jimbankoski) Consider rewriting the c to take the same values rather
 // than going through these pointer conversions
-#if HAVE_MMX
+#if 0 && HAVE_MMX
 void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input,   output,    pitch);
   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }
 
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);
-}
-
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
 void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
@@ -44,27 +30,15 @@
   unsigned char *z = *(be->base_src) + be->src;
   unsigned int  src_stride = be->src_stride;
   short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
+  unsigned char *predictor = *(bd->base_dst) + bd->dst;
+  // TODO(jingning): The prototype function in c has been changed. Need to
+  // modify the mmx and sse versions.
   vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
 }
 
 #endif
 
-#if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);
-}
-
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
+#if 0 && HAVE_SSE2
 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);
@@ -72,7 +46,9 @@
   unsigned char *z = *(be->base_src) + be->src;
   unsigned int  src_stride = be->src_stride;
   short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
+  unsigned char *predictor = *(bd->base_dst) + bd->dst;
+  // TODO(jingning): The prototype function in c has been changed. Need to
+  // modify the mmx and sse versions.
   vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
 }
 
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -15,7 +15,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_onyx.h
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
-VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 VP9_COMMON_SRCS-yes += common/vp9_convolve.c
 VP9_COMMON_SRCS-yes += common/vp9_convolve.h
@@ -36,9 +35,9 @@
 VP9_COMMON_SRCS-yes += common/vp9_entropy.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
+VP9_COMMON_SRCS-yes += common/vp9_enums.h
 VP9_COMMON_SRCS-yes += common/vp9_extend.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
-VP9_COMMON_SRCS-yes += common/vp9_header.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
@@ -56,8 +55,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
@@ -72,12 +69,8 @@
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
-VP9_COMMON_SRCS-yes += common/vp9_recon.c
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
-VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.c
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
@@ -92,7 +85,6 @@
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
@@ -113,14 +105,6 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -20,7 +20,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-struct vp8_extracfg {
+struct vp9_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
   int                         cpu_used;                    /** available cpu percentage in 1/16*/
   unsigned int                enable_auto_alt_ref;           /** if encoder decides to uses alternate reference frame */
@@ -42,7 +42,7 @@
 
 struct extraconfig_map {
   int                 usage;
-  struct vp8_extracfg cfg;
+  struct vp9_extracfg cfg;
 };
 
 static const struct extraconfig_map extracfg_map[] = {
@@ -73,7 +73,7 @@
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t        base;
   vpx_codec_enc_cfg_t     cfg;
-  struct vp8_extracfg     vp8_cfg;
+  struct vp9_extracfg     vp8_cfg;
   VP9_CONFIG              oxcf;
   VP9_PTR             cpi;
   unsigned char          *cx_data;
@@ -131,7 +131,7 @@
 
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
-                                       const struct vp8_extracfg *vp8_cfg) {
+                                       const struct vp9_extracfg *vp8_cfg) {
   RANGE_CHECK(cfg, g_w,                   1, 65535); /* 16 bits available */
   RANGE_CHECK(cfg, g_h,                   1, 65535); /* 16 bits available */
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
@@ -211,11 +211,12 @@
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
       break;
     default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+      ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
+            "supported.");
   }
 
   if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
@@ -225,9 +226,9 @@
 }
 
 
-static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
+static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
                                        vpx_codec_enc_cfg_t cfg,
-                                       struct vp8_extracfg vp8_cfg) {
+                                       struct vp9_extracfg vp8_cfg) {
   oxcf->version = cfg.g_profile | (vp8_cfg.experimental ? 0x4 : 0);
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
@@ -350,7 +351,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
+static vpx_codec_err_t vp9e_set_config(vpx_codec_alg_priv_t       *ctx,
                                        const vpx_codec_enc_cfg_t  *cfg) {
   vpx_codec_err_t res;
 
@@ -369,7 +370,7 @@
 
   if (!res) {
     ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
@@ -405,7 +406,7 @@
                                  int                   ctrl_id,
                                  va_list               args) {
   vpx_codec_err_t     res  = VPX_CODEC_OK;
-  struct vp8_extracfg xcfg = ctx->vp8_cfg;
+  struct vp9_extracfg xcfg = ctx->vp8_cfg;
 
 #define MAP(id, var) case id: var = CAST(id, args); break;
 
@@ -432,7 +433,7 @@
 
   if (!res) {
     ctx->vp8_cfg = xcfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
@@ -441,7 +442,7 @@
 }
 
 
-static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
                                         int              experimental) {
   vpx_codec_err_t            res = VPX_CODEC_OK;
   struct vpx_codec_alg_priv *priv;
@@ -486,7 +487,10 @@
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
     priv->vp8_cfg.experimental = experimental;
 
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+    // TODO(agrange) Check the limits set on this buffer, or the check that is
+    // applied in vp9e_encode.
+    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
+//    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
 
     if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
 
@@ -501,7 +505,7 @@
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
 
     if (!res) {
-      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+      set_vp9e_config(&ctx->priv->alg_priv->oxcf,
                       ctx->priv->alg_priv->cfg,
                       ctx->priv->alg_priv->vp8_cfg);
       optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
@@ -517,21 +521,21 @@
 }
 
 
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_init(vpx_codec_ctx_t *ctx,
                                  vpx_codec_priv_enc_mr_cfg_t *data) {
-  return vp8e_common_init(ctx, 0);
+  return vp9e_common_init(ctx, 0);
 }
 
 
 #if CONFIG_EXPERIMENTAL
-static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
                                      vpx_codec_priv_enc_mr_cfg_t *data) {
-  return vp8e_common_init(ctx, 1);
+  return vp9e_common_init(ctx, 1);
 }
 #endif
 
 
-static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
+static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
 
   free(ctx->cx_data);
   vp9_remove_compressor(&ctx->cpi);
@@ -539,28 +543,6 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = (1 + yv12->y_width) / 2;
-  yv12->uv_height = (1 + yv12->y_height) / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
-  return res;
-}
-
 static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
                                     unsigned long          duration,
                                     unsigned long          deadline) {
@@ -626,7 +608,7 @@
   return index_sz;
 }
 
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
                                    const vpx_image_t     *img,
                                    vpx_codec_pts_t        pts,
                                    unsigned long          duration,
@@ -754,7 +736,7 @@
         vpx_codec_cx_pkt_t pkt;
         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
 
-        /* Pack invisible frames with the next visisble frame */
+        /* Pack invisible frames with the next visible frame */
         if (!cpi->common.show_frame) {
           if (!ctx->pending_cx_data)
             ctx->pending_cx_data = cx_data;
@@ -849,12 +831,12 @@
 }
 
 
-static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
+static const vpx_codec_cx_pkt_t *vp9e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
                                                  vpx_codec_iter_t      *iter) {
   return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
 }
 
-static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -871,7 +853,7 @@
 
 }
 
-static vpx_codec_err_t vp8e_copy_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
 
@@ -904,7 +886,7 @@
   }
 }
 
-static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
 #if CONFIG_POSTPROC
@@ -925,7 +907,7 @@
 }
 
 
-static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
+static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
 
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags = {0};
@@ -937,45 +919,13 @@
   }
 
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
-    /*
-    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
-        sd.y_width + 2*VP9BORDERINPIXELS,
-        sd.y_height + 2*VP9BORDERINPIXELS,
-        1,
-        sd.buffer_alloc);
-    vpx_img_set_rect(&ctx->preview_img,
-        VP9BORDERINPIXELS, VP9BORDERINPIXELS,
-        sd.y_width, sd.y_height);
-        */
-
-    ctx->preview_img.bps = 12;
-    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
-    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
-    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
-    if (sd.clrtype == REG_YUV)
-      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-    else
-      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
-    ctx->preview_img.x_chroma_shift = 1;
-    ctx->preview_img.y_chroma_shift = 1;
-
-    ctx->preview_img.d_w = sd.y_width;
-    ctx->preview_img.d_h = sd.y_height;
-    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
-    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
-    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
-    ctx->preview_img.w   = sd.y_width;
-    ctx->preview_img.h   = sd.y_height;
-
+    yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
   } else
     return NULL;
 }
 
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
   int update = va_arg(args, int);
@@ -984,7 +934,7 @@
 
 }
 
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
                                              int ctr_id,
                                              va_list args) {
   int update = va_arg(args, int);
@@ -992,7 +942,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   int reference_flag = va_arg(args, int);
@@ -1000,7 +950,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         int ctr_id,
                                         va_list args) {
   vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
@@ -1018,7 +968,7 @@
 }
 
 
-static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
@@ -1035,7 +985,7 @@
     return VPX_CODEC_INVALID_PARAM;
 }
 
-static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
 
@@ -1056,16 +1006,16 @@
 }
 
 
-static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
-  {VP8_SET_REFERENCE,                 vp8e_set_reference},
-  {VP8_COPY_REFERENCE,                vp8e_copy_reference},
-  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
-  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
-  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
-  {VP8E_USE_REFERENCE,                vp8e_use_reference},
-  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
-  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
-  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
+static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
+  {VP8_SET_REFERENCE,                 vp9e_set_reference},
+  {VP8_COPY_REFERENCE,                vp9e_copy_reference},
+  {VP8_SET_POSTPROC,                  vp9e_set_previewpp},
+  {VP8E_UPD_ENTROPY,                  vp9e_update_entropy},
+  {VP8E_UPD_REFERENCE,                vp9e_update_reference},
+  {VP8E_USE_REFERENCE,                vp9e_use_reference},
+  {VP8E_SET_ROI_MAP,                  vp9e_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                vp9e_set_activemap},
+  {VP8E_SET_SCALEMODE,                vp9e_set_scalemode},
   {VP8E_SET_CPUUSED,                  set_param},
   {VP8E_SET_NOISE_SENSITIVITY,        set_param},
   {VP8E_SET_ENABLEAUTOALTREF,         set_param},
@@ -1086,7 +1036,7 @@
   { -1, NULL},
 };
 
-static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
+static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
   {
     0,
     {
@@ -1151,9 +1101,9 @@
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
   VPX_CODEC_CAP_OUTPUT_PARTITION,
   /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp9e_init,          /* vpx_codec_init_fn_t       init; */
+  vp9e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
   {
@@ -1163,12 +1113,12 @@
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
   {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
-    vp8e_set_config,
+    vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
+    vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
+    vp9e_set_config,
     NOT_IMPLEMENTED,
-    vp8e_get_preview,
+    vp9e_get_preview,
   } /* encoder functions */
 };
 
@@ -1180,9 +1130,9 @@
   VPX_CODEC_INTERNAL_ABI_VERSION,
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
   /* vpx_codec_caps_t          caps; */
-  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp9e_exp_init,      /* vpx_codec_init_fn_t       init; */
+  vp9e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
   {
@@ -1192,12 +1142,12 @@
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
   {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
-    vp8e_set_config,
+    vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
+    vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
+    vp9e_set_config,
     NOT_IMPLEMENTED,
-    vp8e_get_preview,
+    vp9e_get_preview,
   } /* encoder functions */
 };
 #endif
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -215,26 +215,19 @@
   if (data + data_sz <= data)
     res = VPX_CODEC_INVALID_PARAM;
   else {
-    /* Parse uncompresssed part of key frame header.
-     * 3 bytes:- including version, frame type and an offset
-     * 3 bytes:- sync code (0x9d, 0x01, 0x2a)
-     * 4 bytes:- including image width and height in the lowest 14 bits
-     *           of each 2-byte value.
-     */
     si->is_kf = 0;
 
-    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
-      const uint8_t *c = data + 3;
+    if (data_sz >= 8 && (data[0] & 0xD8) == 0x80) { /* I-Frame */
+      const uint8_t *c = data + 1;
       si->is_kf = 1;
 
-      /* vet via sync code */
-      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+      if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2)
         res = VPX_CODEC_UNSUP_BITSTREAM;
 
-      si->w = (c[3] | (c[4] << 8));
-      si->h = (c[5] | (c[6] << 8));
+      si->w = (c[3] << 8) | c[4];
+      si->h = (c[5] << 8) | c[6];
 
-      /*printf("w=%d, h=%d\n", si->w, si->h);*/
+      // printf("w=%d, h=%d\n", si->w, si->h);
       if (!(si->h | si->w))
         res = VPX_CODEC_UNSUP_BITSTREAM;
     } else
@@ -242,7 +235,6 @@
   }
 
   return res;
-
 }
 
 static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
@@ -329,9 +321,9 @@
 
       vp9_initialize_dec();
 
-      oxcf.Width = ctx->si.w;
-      oxcf.Height = ctx->si.h;
-      oxcf.Version = 9;
+      oxcf.width = ctx->si.w;
+      oxcf.height = ctx->si.h;
+      oxcf.version = 9;
       oxcf.postprocess = 0;
       oxcf.max_threads = ctx->cfg.threads;
       oxcf.inv_tile_order = ctx->invert_tile_order;
@@ -574,30 +566,6 @@
     vp8_finalize_mmaps(ctx->priv->alg_priv);
     res = ctx->iface->init(ctx, NULL);
   }
-
-  return res;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = yv12->y_width / 2;
-  yv12->uv_height = yv12->y_height / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
-                   img->fmt == VPX_IMG_FMT_VPXYV12);
 
   return res;
 }
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -10,30 +10,39 @@
 #ifndef VP9_VP9_IFACE_COMMON_H_
 #define VP9_VP9_IFACE_COMMON_H_
 
-static void yuvconfig2image(vpx_image_t               *img,
-                            const YV12_BUFFER_CONFIG  *yv12,
-                            void                      *user_priv) {
+static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
+                            void *user_priv) {
   /** vpx_img_wrap() doesn't allow specifying independent strides for
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  img->fmt = yv12->clrtype == REG_YUV ?
-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  int bps = 12;
+  if (yv12->uv_height == yv12->y_height) {
+    if (yv12->uv_width == yv12->y_width) {
+      img->fmt = VPX_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = VPX_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    img->fmt = VPX_IMG_FMT_I420;
+  }
   img->w = yv12->y_stride;
-  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
+  img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS);
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->x_chroma_shift = yv12->uv_width < yv12->y_width;
+  img->y_chroma_shift = yv12->uv_height < yv12->y_height;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
   img->stride[VPX_PLANE_Y] = yv12->y_stride;
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
+  img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
+  img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
   img->img_data_owner = 0;
@@ -40,4 +49,41 @@
   img->self_allocd = 0;
 }
 
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+  yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];
+
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+
+  yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
+                                            : yv12->y_width;
+  yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
+                                             : yv12->y_height;
+
+  yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;
+  yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+  yv12->clrtype = REG_YUV;
+
+#if CONFIG_ALPHA
+  // For development purposes, force alpha to hold the same data as Y for now.
+  yv12->alpha_buffer = yv12->y_buffer;
+  yv12->alpha_width = yv12->y_width;
+  yv12->alpha_height = yv12->y_height;
+  yv12->alpha_stride = yv12->y_stride;
 #endif
+  return VPX_CODEC_OK;
+}
+
+#endif  // VP9_VP9_IFACE_COMMON_H_
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -17,16 +17,6 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
-VP9_CX_SRCS-yes += encoder/vp9_asm_enc_offsets.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
@@ -38,6 +28,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.h
+VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeintra.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
@@ -82,7 +73,6 @@
 
 
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_x86.h
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
@@ -95,7 +85,6 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
-#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
@@ -102,21 +91,10 @@
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
-#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
-endif
 
-
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
-         vp9_asm_enc_offsets.asm, $(VP9_PREFIX)encoder/vp9_asm_enc_offsets.c))
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -22,11 +22,10 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
-VP9_DX_SRCS-yes += decoder/vp9_dequantize.c
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
+VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
-VP9_DX_SRCS-yes += decoder/vp9_dequantize.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
@@ -33,16 +32,11 @@
 VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
 VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
+VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
-
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -215,9 +215,13 @@
   unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
   unsigned int   rows;         /**< number of rows */
   unsigned int   cols;         /**< number of cols */
-  int     delta_q[4];          /**< quantizer delta [-63, 63] off baseline for regions with id between 0 and 3*/
-  int     delta_lf[4];         /**< loop filter strength delta [-63, 63] for regions with id between 0 and 3 */
-  unsigned int   static_threshold[4];/**< threshold for region to be treated as static */
+  // TODO(paulwilkins): broken for VP9 which has 8 segments
+  // q and loop filter deltas for each segment
+  // (see MAX_MB_SEGMENTS)
+  int     delta_q[4];
+  int     delta_lf[4];
+  // Static breakout threshold for each segment
+  unsigned int   static_threshold[4];
 } vpx_roi_map_t;
 
 /*!\brief  vpx active region map
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,9 +55,11 @@
     VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
     VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2,
     VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
-    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4   /** < planar 4:2:0 format with vpx color space */
-  }
-                        vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -76,6 +76,10 @@
     ybf->uv_height = uv_height;
     ybf->uv_stride = uv_stride;
 
+    ybf->alpha_width = 0;
+    ybf->alpha_height = 0;
+    ybf->alpha_stride = 0;
+
     ybf->border = border;
     ybf->frame_size = frame_size;
 
@@ -82,6 +86,7 @@
     ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
     ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;
     ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;
+    ybf->alpha_buffer = NULL;
 
     ybf->corrupted = 0; /* assume not currupted by errors */
     return 0;
@@ -97,3 +102,107 @@
   }
   return -2;
 }
+
+#if CONFIG_VP9
+// TODO(jkoleszar): Maybe replace this with struct vpx_image
+
+int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    vpx_free(ybf->buffer_alloc);
+
+    /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
+      u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
+      all of this so that a freed pointer isn't inadvertently used */
+    vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height,
+                             int ss_x, int ss_y, int border) {
+  if (ybf) {
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    const int yplane_size = (aligned_height + 2 * border) * y_stride;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_stride = y_stride >> ss_x;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+    const int uvplane_size = (uv_height + 2 * uv_border_h) * uv_stride;
+#if CONFIG_ALPHA
+    const int alpha_width = aligned_width;
+    const int alpha_height = aligned_height;
+    const int alpha_stride = y_stride;
+    const int alpha_border_w = border;
+    const int alpha_border_h = border;
+    const int alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
+                                 alpha_stride;
+    const int frame_size = yplane_size + 2 * uvplane_size +
+                           alpha_plane_size;
+#else
+    const int frame_size = yplane_size + 2 * uvplane_size;
+#endif
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc_sz = frame_size;
+    }
+
+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+      return -1;
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->border = border;
+    ybf->frame_size = frame_size;
+
+    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+
+#if CONFIG_ALPHA
+    ybf->alpha_width = alpha_width;
+    ybf->alpha_height = alpha_height;
+    ybf->alpha_stride = alpha_stride;
+    ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size +
+                        (alpha_border_h * alpha_stride) + alpha_border_w;
+#endif
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;
+}
+
+int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height,
+                           int ss_x, int ss_y, int border) {
+  if (ybf) {
+    vp9_free_frame_buffer(ybf);
+    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border);
+  }
+  return -2;
+}
+#endif
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
@@ -94,6 +95,36 @@
                (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
 }
 
+#if CONFIG_VP9
+void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                int subsampling_x, int subsampling_y) {
+  const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x;
+  const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
+  const int c_et = ybf->border >> subsampling_y;
+  const int c_el = ybf->border >> subsampling_x;
+  const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height +
+                    subsampling_y) >> subsampling_y;
+  const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width +
+                    subsampling_x) >> subsampling_x;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ybf->border, ybf->border,
+               ybf->border + ybf->y_height - ybf->y_crop_height,
+               ybf->border + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+#endif
 
 /****************************************************************************
  *
--- a/vpx_scale/vpx_scale_rtcd.sh
+++ b/vpx_scale/vpx_scale_rtcd.sh
@@ -24,3 +24,8 @@
 
 prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
 specialize vp8_yv12_copy_y neon
+
+if [ "$CONFIG_VP9" = "yes" ]; then
+    prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
+    specialize vp9_extend_frame_borders
+fi
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -18,7 +18,7 @@
 #include "vpx/vpx_integer.h"
 
 #define VP8BORDERINPIXELS       32
-#define VP9BORDERINPIXELS       64
+#define VP9BORDERINPIXELS       96
 #define VP9_INTERP_EXTEND        4
 
   /*************************************
@@ -52,9 +52,14 @@
     int   uv_stride;
     /*    int   uvinternal_width; */
 
+    int   alpha_width;
+    int   alpha_height;
+    int   alpha_stride;
+
     uint8_t *y_buffer;
     uint8_t *u_buffer;
     uint8_t *v_buffer;
+    uint8_t *alpha_buffer;
 
     uint8_t *buffer_alloc;
     int buffer_alloc_sz;
@@ -71,6 +76,14 @@
   int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                     int width, int height, int border);
   int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+  int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height, int ss_x, int ss_y,
+                             int border);
+  int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                               int width, int height, int ss_x, int ss_y,
+                               int border);
+  int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -12,6 +12,7 @@
 /* This is a simple program that reads ivf files and decodes them
  * using the new interface. Decoded frames are output as YV12 raw.
  */
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -891,6 +892,7 @@
 
   if (use_y4m && !noblit) {
     char buffer[128];
+
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
               " try --i420 or --yv12.\n");
@@ -908,8 +910,8 @@
     /*Note: We can't output an aspect ratio here because IVF doesn't
        store one, and neither does VP8.
       That will have to wait until these tools support WebM natively.*/
-    sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
-            "420jpeg", width, height, fps_num, fps_den, 'p');
+    snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
+             width, height, fps_num, fps_den, 'p');
     out_put(out, (unsigned char *)buffer,
             (unsigned int)strlen(buffer), do_md5);
   }
@@ -1036,6 +1038,17 @@
       show_progress(frame_in, frame_out, dx_time);
 
     if (!noblit) {
+      if (frame_out == 1 && img && use_y4m) {
+        /* Write out the color format to terminate the header line */
+        const char *color =
+            img->fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+            img->fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+            img->fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+            "C420jpeg\n";
+
+        out_put(out, (const unsigned char*)color, strlen(color), do_md5);
+      }
+
       if (do_scale) {
         if (img && frame_out == 1) {
           stream_w = img->d_w;
@@ -1044,6 +1057,7 @@
                                      stream_w, stream_h, 16);
         }
         if (img && (img->d_w != stream_w || img->d_h != stream_h)) {
+          assert(img->fmt == VPX_IMG_FMT_I420);
           I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
                     img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
                     img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
@@ -1064,6 +1078,12 @@
         unsigned int y;
         char out_fn[PATH_MAX];
         uint8_t *buf;
+        unsigned int c_w =
+            img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift
+                                : img->d_w;
+        unsigned int c_h =
+            img->y_chroma_shift ? (1 + img->d_h) >> img->y_chroma_shift
+                                : img->d_h;
 
         if (!single_file) {
           size_t len = sizeof(out_fn) - 1;
@@ -1084,15 +1104,15 @@
 
         buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_U];
         }
 
         buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_V];
         }
 
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -326,6 +326,7 @@
   unsigned int          h;
   struct vpx_rational   framerate;
   int                   use_i420;
+  int                   only_i420;
 };
 
 
@@ -1481,9 +1482,12 @@
 
 #define mmin(a, b)  ((a) < (b) ? (a) : (b))
 static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2,
-                          int yloc[2], int uloc[2], int vloc[2]) {
+                          int yloc[4], int uloc[4], int vloc[4]) {
   const unsigned int bsize = 64;
-  const unsigned int bsize2 = bsize >> 1;
+  const unsigned int bsizey = bsize >> img1->y_chroma_shift;
+  const unsigned int bsizex = bsize >> img1->x_chroma_shift;
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   unsigned int match = 1;
   unsigned int i, j;
   yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
@@ -1510,12 +1514,13 @@
         }
     }
   }
+
   uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_U] +
@@ -1535,11 +1540,11 @@
     }
   }
   vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_V] +
@@ -1562,6 +1567,8 @@
 
 static int compare_img(vpx_image_t *img1, vpx_image_t *img2)
 {
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   int match = 1;
   unsigned int i;
 
@@ -1574,15 +1581,15 @@
                      img2->planes[VPX_PLANE_Y]+i*img2->stride[VPX_PLANE_Y],
                      img1->d_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_U]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_U]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_V]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_V]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
   return match;
 }
@@ -1792,7 +1799,8 @@
 
   if (input->detect.buf_read == 4
       && file_is_y4m(input->file, &input->y4m, input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
       input->w = input->y4m.pic_w;
       input->h = input->y4m.pic_h;
@@ -2516,6 +2524,7 @@
   input.framerate.num = 30;
   input.framerate.den = 1;
   input.use_i420 = 1;
+  input.only_i420 = 1;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
@@ -2549,6 +2558,12 @@
 
   if (!input.fn)
     usage_exit();
+
+#if CONFIG_NON420
+  /* Decide if other chroma subsamplings than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC)
+    input.only_i420 = 0;
+#endif
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
--- a/y4minput.c
+++ b/y4minput.c
@@ -659,7 +659,8 @@
                              unsigned char *_aux) {
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip) {
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
   char buffer[80];
   int  ret;
   int  i;
@@ -701,6 +702,8 @@
             "Only progressive scan handled.\n");
     return -1;
   }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->vpx_bps = 12;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
@@ -734,16 +737,30 @@
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_422jpeg_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->vpx_bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -758,29 +775,52 @@
     _y4m->convert = y4m_convert_411_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->vpx_bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.
-      The extra plane also gets read into the aux buf.
-      It will be discarded.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->vpx_bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
@@ -847,22 +887,23 @@
      sizes, which would require a separate fread call for every row.*/
   memset(_img, 0, sizeof(*_img));
   /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
-  _img->fmt = IMG_FMT_I420;
+  _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
-  /*This is hard-coded to 4:2:0 for now, as that's all VP8 supports.*/
-  _img->x_chroma_shift = 1;
-  _img->y_chroma_shift = 1;
-  _img->bps = 12;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->vpx_bps;
+
   /*Set up the buffer pointers.*/
   pic_sz = _y4m->pic_w * _y4m->pic_h;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
   _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
--- a/y4minput.h
+++ b/y4minput.h
@@ -51,9 +51,12 @@
   y4m_convert_func  convert;
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
+  enum vpx_img_fmt  vpx_fmt;
+  int               vpx_bps;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip);
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);