shithub: libvpx

Download patch

ref: 4209bba4627bb148fce5b8b6caa4de5ac526fa92
parent: 764be4f66fc3928ec668f8faccef1705049ff5a5
parent: 97dd7342b8591627b7dd07cb79f91b544213caa9
author: Ronald S. Bultje <[email protected]>
date: Tue Mar 5 06:17:14 EST 2013

Merge changes Ifacbf5a0,Ibad7c3dd into experimental

* changes:
  vpxenc: actually report mismatch on stderr.
  Make superblocks independent of macroblock code and data.

--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -12,15 +12,431 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-const uint8_t vp9_block2left[TX_SIZE_MAX_SB][24] = {
-  {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7},
-  {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6}
+const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24] = {  /* 24 entries per row; rows presumably indexed by transform size -- TODO confirm */
+  { 0, 0, 0, 0,  /* entries 0-15, written four per line */
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    3, 3, 3, 3,
+    4, 4,  /* entries 16-23, written two per line */
+    5, 5,
+    6, 6,
+    7, 7 },
+  { 0, 0, 0, 0,
+    0, 0, 0, 0,
+    2, 2, 2, 2,
+    2, 2, 2, 2,
+    4, 4,
+    4, 4,
+    6, 6,
+    6, 6 },
+  { 0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0 },  /* only 16 initializers: entries 16-23 are implicitly 0 -- NOTE(review): confirm they are never read for this row */
 };
-const uint8_t vp9_block2above[TX_SIZE_MAX_SB][24] = {
-  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7},
-  {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6},
-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6}
+const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24] = {  /* 24 entries per row; rows presumably indexed by transform size -- TODO confirm */
+  { 0, 1, 2, 3,  /* entries 0-15, written four per line */
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    4, 5,  /* entries 16-23, written two per line */
+    4, 5,
+    6, 7,
+    6, 7 },
+  { 0, 0, 0, 0,
+    2, 2, 2, 2,
+    0, 0, 0, 0,
+    2, 2, 2, 2,
+    4, 4,
+    4, 4,
+    6, 6,
+    6, 6 },
+  { 0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0 },  /* only 16 initializers: entries 16-23 are implicitly 0 -- NOTE(review): confirm they are never read for this row */
 };
+
+#define S(x) ((x) + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))  /* x plus the number of ENTROPY_CONTEXT units in one ENTROPY_CONTEXT_PLANES; parenthesized so the expansion is precedence-safe */
+const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96] = {  /* 96 entries per row; rows presumably indexed by transform size -- TODO confirm */
+  { 0, 0, 0, 0, 0, 0, 0, 0,  /* entries 0-63, written eight per line */
+    1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
+    4, 4, 4, 4,  /* entries 64-95, written four per line */
+    5, 5, 5, 5,
+    S(4), S(4), S(4), S(4),
+    S(5), S(5), S(5), S(5),
+    6, 6, 6, 6,
+    7, 7, 7, 7,
+    S(6), S(6), S(6), S(6),
+    S(7), S(7), S(7), S(7) },
+  { 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    S(4), S(4), S(4), S(4),
+    S(4), S(4), S(4), S(4),
+    6, 6, 6, 6,
+    6, 6, 6, 6,
+    S(6), S(6), S(6), S(6),
+    S(6), S(6), S(6), S(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    6, 6, 6, 6,
+    6, 6, 6, 6,
+    6, 6, 6, 6,
+    6, 6, 6, 6 },
+  { 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0 },  /* only 64 initializers: entries 64-95 are implicitly 0 -- NOTE(review): confirm they are never read for this row */
+};
+const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96] = {  /* 96 entries per row; rows presumably indexed by transform size -- TODO confirm */
+  { 0, 1, 2, 3, S(0), S(1), S(2), S(3),  /* entries 0-63, written eight per line */
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3),
+    4, 5, S(4), S(5),  /* entries 64-95, written four per line */
+    4, 5, S(4), S(5),
+    4, 5, S(4), S(5),
+    4, 5, S(4), S(5),
+    6, 7, S(6), S(7),
+    6, 7, S(6), S(7),
+    6, 7, S(6), S(7),
+    6, 7, S(6), S(7) },
+  { 0, 0, 0, 0, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    0, 0, 0, 0, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    0, 0, 0, 0, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    0, 0, 0, 0, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    4, 4, 4, 4,
+    S(4), S(4), S(4), S(4),
+    4, 4, 4, 4,
+    S(4), S(4), S(4), S(4),
+    6, 6, 6, 6,
+    S(6), S(6), S(6), S(6),
+    6, 6, 6, 6,
+    S(6), S(6), S(6), S(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    4, 4, 4, 4,
+    6, 6, 6, 6,
+    6, 6, 6, 6,
+    6, 6, 6, 6,
+    6, 6, 6, 6 },
+  { 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0 },  /* only 64 initializers: entries 64-95 are implicitly 0 -- NOTE(review): confirm they are never read for this row */
+};
+
+#define T(x) ((x) + 2 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)))  /* x plus twice the ENTROPY_CONTEXT count of one ENTROPY_CONTEXT_PLANES; parenthesized so the expansion is precedence-safe */
+#define U(x) ((x) + 3 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)))  /* x plus three times the ENTROPY_CONTEXT count of one ENTROPY_CONTEXT_PLANES; parenthesized so the expansion is precedence-safe */
+const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1),
+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
+    T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1),
+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
+    U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3),
+    4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5,
+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
+    S(5), S(5), S(5), S(5), S(5), S(5), S(5), S(5),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(5), T(5), T(5), T(5), T(5), T(5), T(5), T(5),
+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
+    U(5), U(5), U(5), U(5), U(5), U(5), U(5), U(5),
+    6, 6, 6, 6, 6, 6, 6, 6,
+    7, 7, 7, 7, 7, 7, 7, 7,
+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
+    S(7), S(7), S(7), S(7), S(7), S(7), S(7), S(7),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(7), T(7), T(7), T(7), T(7), T(7), T(7), T(7),
+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
+    U(7), U(7), U(7), U(7), U(7), U(7), U(7), U(7) },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6 },
+};
+const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384] = {
+  { 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7) },
+  { 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
+    4, 4, 4, 4, S(4), S(4), S(4), S(4),
+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
+    4, 4, 4, 4, S(4), S(4), S(4), S(4),
+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
+    4, 4, 4, 4, S(4), S(4), S(4), S(4),
+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
+    4, 4, 4, 4, S(4), S(4), S(4), S(4),
+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
+    6, 6, 6, 6, S(6), S(6), S(6), S(6),
+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
+    6, 6, 6, 6, S(6), S(6), S(6), S(6),
+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
+    6, 6, 6, 6, S(6), S(6), S(6), S(6),
+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
+    6, 6, 6, 6, S(6), S(6), S(6), S(6),
+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6 },
+};
+#undef U
+#undef T
+#undef S
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -277,13 +277,6 @@
   union b_mode_info bmi;
 } BLOCKD;
 
-typedef struct superblockd {
-  /* 32x32 Y and 16x16 U/V */
-  DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]);
-  DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]);
-  DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);
-} SUPERBLOCKD;
-
 struct scale_factors {
   int x_num;
   int x_den;
@@ -297,14 +290,12 @@
 };
 
 typedef struct macroblockd {
-  DECLARE_ALIGNED(16, int16_t,  diff[384]);      /* from idct diff */
-  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[384]);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[384]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[24]);
+  DECLARE_ALIGNED(16, int16_t,  diff[64*64+32*32*2]);      /* from idct diff */
+  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);  // unused for superblocks
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);
 
-  SUPERBLOCKD sb_coeff_data;
-
   /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
   BLOCKD block[24];
   int fullpixel_mask;
@@ -451,8 +442,12 @@
   }
 }
 
-extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][24];
-extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][24];
+extern const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24];
+extern const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24];
+extern const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96];
+extern const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96];
+extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384];
+extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384];
 
 #define USE_ADST_FOR_I16X16_8X8   0
 #define USE_ADST_FOR_I16X16_4X4   0
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -270,85 +270,85 @@
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 202,  29, 181, 221, 168, 177, 217, 162, 235, 202, 157 },
-        { 117,  39, 146, 207, 155, 172, 203, 155, 236, 192, 208 },
-        {  46,  40,  99, 171, 136, 161, 176, 140, 229, 177, 208 }
+        { 210,  33, 210, 232, 185, 185, 210, 166, 207, 192, 146 },
+        { 118,  47, 169, 220, 170, 179, 201, 160, 231, 183, 211 },
+        {  40,  52, 119, 203, 146, 169, 207, 160, 242, 194, 222 }
       }, { /* Coeff Band 1 */
-        {   1, 138, 204, 227, 179, 181, 224, 161, 249, 203, 237 },
-        { 116, 138, 209, 227, 179, 180, 222, 165, 248, 204, 241 },
-        {  63, 112, 184, 227, 183, 178, 223, 167, 248, 206, 237 },
-        {  47,  84, 140, 219, 163, 177, 223, 160, 249, 207, 241 },
-        {  25,  53,  76, 179, 120, 156, 217, 152, 248, 205, 232 },
-        {  10,  23,  29,  76,  91, 132, 145, 109, 228, 169, 214 }
+        {   1, 158, 215, 239, 192, 188, 234, 174, 253, 219, 230 },
+        { 130, 149, 210, 238, 191, 188, 233, 174, 253, 221, 240 },
+        {  59, 123, 193, 237, 188, 187, 232, 174, 252, 220, 246 },
+        {  22,  89, 154, 232, 172, 183, 233, 173, 253, 219, 237 },
+        {   4,  49,  83, 193, 128, 160, 227, 161, 253, 219, 233 },
+        {   1,  18,  27,  87,  90, 133, 160, 112, 242, 185, 231 }
       }, { /* Coeff Band 2 */
-        {   1,  69, 198, 223, 179, 177, 225, 154, 251, 208, 227 },
-        {  78,  78, 170, 223, 170, 179, 218, 162, 248, 203, 245 },
-        {  26,  69, 117, 209, 154, 170, 215, 160, 249, 205, 239 },
-        {  16,  54,  79, 180, 119, 156, 208, 151, 248, 201, 238 },
-        {  12,  43,  45, 119, 102, 142, 186, 126, 245, 193, 236 },
-        {   1,  24,  22,  60,  92, 133, 114,  99, 221, 154, 210 }
+        {   1,  87, 205, 244, 192, 193, 239, 188, 252, 220, 217 },
+        {  64,  93, 169, 237, 175, 186, 237, 184, 253, 222, 235 },
+        {  19,  77, 130, 222, 154, 175, 231, 173, 253, 221, 223 },
+        {   6,  59,  95, 196, 132, 162, 223, 160, 251, 215, 240 },
+        {   1,  37,  57, 144, 109, 146, 201, 135, 250, 205, 238 },
+        {   1,  17,  26,  81,  94, 138, 135, 107, 232, 168, 223 }
       }, { /* Coeff Band 3 */
-        {   1, 135, 214, 222, 183, 178, 230, 144, 252, 208, 241 },
-        { 107, 122, 201, 229, 181, 182, 221, 165, 250, 202, 243 },
-        {  38, 100, 168, 221, 168, 176, 220, 166, 250, 208, 240 },
-        {  21,  83, 125, 206, 149, 167, 217, 160, 250, 209, 238 },
-        {  16,  65,  80, 164, 122, 156, 208, 139, 250, 206, 246 },
-        {   3,  37,  43, 104, 103, 143, 156, 118, 237, 173, 227 }
+        {   1, 150, 219, 243, 198, 192, 237, 182, 253, 227, 245 },
+        {  88, 130, 202, 239, 190, 188, 236, 180, 253, 224, 255 },
+        {  25, 103, 172, 231, 175, 182, 234, 174, 253, 227, 248 },
+        {   7,  78, 128, 215, 156, 172, 228, 166, 252, 222, 248 },
+        {   1,  48,  76, 175, 121, 155, 212, 149, 251, 213, 237 },
+        {   1,  22,  35, 101,  97, 141, 161, 120, 236, 181, 213 }
       }, { /* Coeff Band 4 */
-        {   1, 169, 223, 233, 193, 184, 234, 150, 254, 206, 243 },
-        {  83, 140, 201, 233, 184, 185, 228, 168, 252, 203, 223 },
-        {  19, 104, 158, 225, 168, 179, 228, 169, 253, 207, 248 },
-        {  10,  76, 117, 209, 145, 168, 223, 166, 252, 210, 243 },
-        {   8,  59,  79, 163, 119, 153, 213, 142, 250, 205, 230 },
-        {   1,  31,  43, 100, 103, 144, 149, 116, 240, 171, 221 }
+        {   1, 177, 228, 247, 206, 197, 243, 191, 255, 232, 255 },
+        {  76, 143, 205, 243, 192, 192, 241, 189, 253, 223, 255 },
+        {  17, 107, 163, 233, 170, 183, 239, 183, 253, 227, 218 },
+        {   3,  75, 118, 216, 147, 171, 234, 174, 253, 220, 249 },
+        {   1,  43,  71, 174, 118, 154, 217, 153, 250, 211, 240 },
+        {   1,  19,  31,  93,  93, 136, 154, 116, 235, 178, 228 }
       }, { /* Coeff Band 5 */
-        {   1, 190, 234, 247, 211, 197, 239, 172, 255, 208, 236 },
-        {  65, 152, 218, 244, 199, 194, 236, 184, 252, 199, 249 },
-        {  17, 109, 173, 237, 179, 186, 235, 183, 250, 205, 255 },
-        {   6,  78, 127, 219, 153, 173, 231, 177, 251, 210, 249 },
-        {   3,  56,  77, 172, 121, 157, 215, 152, 249, 209, 247 },
-        {   1,  29,  38,  96,  97, 144, 152, 114, 239, 169, 243 }
+        {   1, 192, 230, 251, 215, 205, 245, 201, 254, 229, 255 },
+        {  66, 142, 206, 248, 200, 202, 244, 197, 255, 224, 255 },
+        {  21, 107, 166, 241, 176, 191, 241, 192, 253, 230, 255 },
+        {   5,  79, 129, 221, 150, 173, 237, 178, 254, 226, 255 },
+        {   1,  43,  72, 173, 117, 151, 217, 150, 253, 216, 245 },
+        {   1,  17,  28,  93,  95, 139, 162, 114, 245, 187, 235 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 223,  71, 225, 221, 176, 169, 242, 165, 248, 216, 201 },
-        { 147,  79, 197, 215, 175, 172, 230, 154, 243, 203, 184 },
-        {  69,  75, 152, 197, 158, 168, 203, 144, 231, 187, 177 }
+        { 235,  68, 250, 244, 206, 192, 229, 177, 248, 215, 237 },
+        { 169,  88, 225, 235, 191, 184, 222, 170, 246, 205, 237 },
+        {  65, 100, 171, 214, 166, 173, 216, 157, 249, 213, 215 }
       }, { /* Coeff Band 1 */
-        {   1, 168, 219, 195, 168, 151, 249, 131, 255, 221, 255 },
-        { 152, 156, 226, 210, 189, 173, 240, 121, 255, 215, 238 },
-        {  82, 128, 198, 239, 201, 194, 220, 151, 254, 202, 251 },
-        {  74, 107, 150, 236, 163, 187, 222, 177, 255, 204, 255 },
-        {  59, 103, 120, 181, 125, 148, 232, 157, 255, 219, 245 },
-        {  21,  63,  84, 129, 122, 150, 171, 118, 246, 196, 226 }
+        {   1, 191, 246, 250, 217, 202, 244, 195, 255, 226, 128 },
+        { 177, 169, 236, 250, 216, 201, 244, 194, 251, 228, 255 },
+        {  70, 132, 205, 250, 209, 205, 246, 193, 254, 246, 255 },
+        {  41, 108, 165, 244, 172, 194, 246, 202, 255, 229, 255 },
+        {  23,  84, 126, 207, 140, 162, 244, 179, 254, 237, 255 },
+        {  11,  57,  83, 149, 127, 156, 180, 126, 247, 202, 220 }
       }, { /* Coeff Band 2 */
-        {   1, 133, 219, 202, 174, 158, 244, 133, 255, 214, 237 },
-        { 101, 132, 204, 221, 187, 183, 225, 131, 253, 201, 247 },
-        {  41, 107, 147, 228, 174, 187, 211, 162, 252, 201, 246 },
-        {  40, 107, 107, 205, 129, 162, 213, 164, 252, 206, 232 },
-        {  24, 140,  90, 122, 111, 141, 210, 127, 251, 208, 239 },
-        {   1,  59,  55,  91, 111, 141, 144, 109, 241, 180, 226 }
+        {   1, 169, 240, 250, 212, 202, 242, 192, 252, 222, 255 },
+        { 105, 151, 215, 246, 200, 197, 240, 190, 253, 221, 255 },
+        {  24, 111, 166, 237, 177, 188, 236, 183, 252, 213, 255 },
+        {   9,  83, 122, 218, 148, 170, 233, 174, 250, 215, 242 },
+        {   1,  55,  77, 168, 118, 152, 215, 150, 248, 213, 226 },
+        {   1,  26,  36, 104,  98, 146, 149, 116, 235, 182, 225 }
       }, { /* Coeff Band 3 */
-        {   1, 170, 226, 200, 179, 153, 245, 138, 255, 214, 241 },
-        { 111, 149, 217, 226, 194, 186, 223, 137, 255, 211, 253 },
-        {  40, 113, 174, 228, 180, 183, 211, 165, 255, 212, 247 },
-        {  44, 101, 126, 210, 151, 167, 212, 161, 255, 217, 241 },
-        {  43, 131, 103, 146, 119, 148, 211, 136, 254, 216, 250 },
-        {   1,  57,  63, 112, 116, 145, 158, 115, 249, 193, 236 }
+        {   1, 191, 243, 251, 219, 204, 246, 196, 255, 230, 128 },
+        {  97, 168, 225, 248, 207, 198, 244, 193, 254, 225, 192 },
+        {  15, 122, 182, 241, 187, 188, 241, 190, 251, 231, 228 },
+        {   3,  83, 131, 226, 160, 178, 237, 180, 251, 222, 205 },
+        {   1,  49,  77, 184, 121, 155, 222, 159, 249, 216, 249 },
+        {   1,  21,  32,  98,  98, 140, 152, 113, 233, 173, 243 }
       }, { /* Coeff Band 4 */
-        {   1, 186, 233, 216, 191, 163, 241, 143, 255, 210, 255 },
-        {  91, 161, 214, 225, 190, 181, 224, 150, 255, 212, 253 },
-        {  26, 117, 163, 220, 172, 180, 218, 148, 255, 215, 252 },
-        {  27,  90, 122, 203, 143, 167, 212, 159, 255, 213, 255 },
-        {  21,  98, 113, 163, 130, 153, 208, 141, 255, 215, 248 },
-        {   1,  47,  66, 130, 118, 151, 167, 123, 252, 199, 235 }
+        {   1, 202, 242, 253, 226, 212, 245, 205, 254, 226, 255 },
+        {  83, 168, 219, 252, 212, 211, 244, 200, 250, 215, 255 },
+        {   9, 143, 174, 245, 183, 197, 241, 194, 254, 217, 255 },
+        {   1, 105, 129, 228, 154, 179, 233, 179, 253, 211, 255 },
+        {   1,  47,  72, 177, 116, 152, 214, 157, 251, 209, 255 },
+        {   1,  18,  26,  79,  94, 137, 150, 109, 246, 175, 248 }
       }, { /* Coeff Band 5 */
-        {   1, 195, 236, 245, 211, 195, 238, 171, 255, 209, 248 },
-        {  65, 156, 218, 245, 200, 196, 230, 185, 255, 212, 248 },
-        {  13, 112, 172, 238, 180, 189, 231, 185, 255, 213, 250 },
-        {   6,  83, 130, 224, 155, 177, 227, 180, 255, 214, 244 },
-        {   5,  71,  91, 185, 133, 160, 214, 154, 254, 212, 248 },
-        {   1,  45,  63, 128, 112, 147, 169, 129, 248, 190, 236 }
+        {   1, 205, 236, 254, 233, 221, 247, 201, 255, 220, 128 },
+        {  87, 149, 205, 254, 211, 219, 245, 207, 255, 239, 128 },
+        {  56, 122, 162, 248, 164, 195, 246, 211, 255, 231, 128 },
+        {  26, 108, 163, 224, 149, 169, 240, 187, 255, 238, 255 },
+        {   1,  54,  89, 171, 123, 152, 219, 148, 254, 226, 255 },
+        {   1,  21,  34,  99,  90, 140, 174, 112, 252, 210, 255 }
       }
     }
   }
@@ -441,90 +441,90 @@
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 198,  28, 192, 217, 170, 174, 201, 162, 219, 179, 159 },
-        {  96,  36, 145, 198, 153, 167, 193, 153, 222, 180, 177 },
-        {  31,  35,  89, 156, 131, 157, 166, 136, 214, 170, 178 }
+        { 203,  35, 218, 235, 189, 187, 194, 174, 175, 150, 127 },
+        {  95,  50, 155, 211, 161, 173, 190, 163, 198, 161, 187 },
+        {  21,  46,  93, 178, 130, 157, 200, 151, 224, 186, 191 }
       }, { /* Coeff Band 1 */
-        {   1, 138, 202, 225, 174, 178, 218, 164, 243, 200, 201 },
-        { 147, 134, 202, 223, 174, 177, 215, 162, 243, 204, 220 },
-        {  65, 115, 179, 224, 176, 177, 215, 162, 243, 202, 227 },
-        {  25,  86, 141, 217, 163, 177, 216, 159, 243, 201, 225 },
-        {   6,  48,  79, 181, 125, 157, 209, 151, 244, 201, 212 },
-        {   1,  16,  25,  77,  91, 134, 132, 112, 210, 162, 180 }
+        {   1, 155, 198, 236, 183, 187, 223, 175, 250, 209, 255 },
+        { 115, 147, 192, 235, 182, 186, 222, 173, 244, 199, 222 },
+        {  43, 124, 174, 234, 178, 186, 222, 176, 249, 201, 255 },
+        {  13,  96, 143, 227, 164, 181, 223, 174, 248, 197, 237 },
+        {   2,  59,  91, 197, 131, 163, 213, 162, 246, 198, 241 },
+        {   1,  19,  29,  85,  96, 139, 128, 116, 215, 153, 204 }
       }, { /* Coeff Band 2 */
-        {   1,  78, 195, 222, 172, 177, 219, 162, 245, 205, 227 },
-        {  67,  79, 154, 211, 158, 171, 212, 159, 243, 201, 222 },
-        {  18,  63, 108, 192, 140, 163, 205, 152, 242, 197, 214 },
-        {   6,  49,  77, 163, 121, 154, 192, 142, 239, 191, 216 },
-        {   1,  34,  49, 112, 106, 143, 160, 122, 233, 178, 213 },
-        {   1,  14,  20,  56,  93, 135,  94, 102, 189, 141, 170 }
+        {   1,  91, 180, 231, 170, 180, 237, 181, 248, 213, 230 },
+        {  39,  83, 139, 220, 153, 173, 233, 179, 243, 200, 228 },
+        {  12,  63, 106, 203, 136, 163, 227, 170, 244, 200, 234 },
+        {   5,  48,  79, 178, 123, 154, 215, 155, 244, 197, 232 },
+        {   1,  32,  50, 125, 104, 144, 171, 130, 238, 181, 229 },
+        {   1,  12,  18,  54,  88, 131,  92,  99, 201, 142, 193 }
       }, { /* Coeff Band 3 */
-        {   1, 137, 210, 229, 182, 181, 223, 164, 247, 214, 201 },
-        {  89, 123, 189, 226, 176, 180, 217, 165, 245, 207, 216 },
-        {  24, 100, 155, 217, 162, 176, 215, 163, 242, 198, 215 },
-        {   8,  78, 121, 199, 147, 167, 206, 155, 241, 198, 212 },
-        {   2,  52,  81, 161, 125, 156, 185, 139, 236, 186, 207 },
-        {   1,  22,  35,  88, 102, 141, 121, 116, 199, 153, 179 }
+        {   1, 152, 202, 238, 186, 188, 227, 178, 248, 205, 229 },
+        {  63, 125, 183, 234, 178, 184, 225, 179, 248, 205, 228 },
+        {  15, 100, 153, 227, 166, 180, 223, 173, 244, 198, 229 },
+        {   4,  76, 119, 210, 149, 170, 215, 165, 245, 200, 221 },
+        {   1,  46,  73, 165, 120, 154, 192, 144, 241, 189, 225 },
+        {   1,  18,  27,  78,  95, 136, 124, 110, 219, 158, 207 }
       }, { /* Coeff Band 4 */
-        {   1, 169, 220, 239, 196, 191, 220, 173, 242, 201, 226 },
-        {  64, 139, 195, 231, 183, 184, 215, 169, 240, 196, 211 },
-        {  12, 103, 153, 217, 162, 174, 212, 163, 236, 195, 211 },
-        {   3,  71, 109, 190, 141, 164, 202, 152, 240, 192, 220 },
-        {   1,  38,  61, 139, 114, 149, 175, 133, 233, 183, 211 },
-        {   1,  13,  22,  61,  93, 134, 101, 106, 194, 145, 185 }
+        {   1, 181, 211, 243, 197, 195, 228, 180, 249, 211, 252 },
+        {  40, 138, 189, 237, 184, 189, 226, 178, 249, 208, 247 },
+        {   7, 103, 153, 226, 166, 179, 223, 171, 249, 209, 224 },
+        {   1,  71, 110, 200, 143, 166, 213, 159, 249, 206, 241 },
+        {   1,  37,  60, 144, 111, 150, 189, 135, 245, 196, 232 },
+        {   1,  15,  25,  75,  91, 134, 128, 108, 224, 163, 213 }
       }, { /* Coeff Band 5 */
-        {   1, 204, 220, 234, 193, 185, 220, 166, 247, 207, 237 },
-        {  42, 139, 187, 221, 174, 177, 215, 161, 246, 201, 242 },
-        {   5,  83, 132, 204, 152, 168, 212, 158, 246, 203, 225 },
-        {   1,  48,  84, 175, 126, 157, 203, 148, 245, 199, 233 },
-        {   1,  24,  46, 123, 103, 142, 178, 128, 243, 189, 235 },
-        {   1,  10,  19,  58,  88, 134, 109, 101, 216, 151, 216 }
+        {   1, 215, 219, 246, 205, 197, 236, 183, 252, 221, 235 },
+        {  32, 146, 197, 239, 187, 188, 234, 180, 252, 223, 247 },
+        {   6, 100, 150, 227, 167, 178, 233, 178, 252, 219, 233 },
+        {   1,  63, 102, 203, 138, 167, 225, 162, 252, 216, 240 },
+        {   1,  33,  56, 148, 109, 146, 202, 138, 250, 208, 237 },
+        {   1,  15,  25,  75,  90, 131, 138, 108, 236, 171, 235 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 227,  36, 243, 237, 206, 186, 210, 157, 245, 195, 200 },
-        { 144,  41, 214, 226, 190, 182, 207, 155, 238, 193, 177 },
-        {  63,  37, 153, 199, 162, 169, 193, 145, 227, 187, 152 }
+        { 228,  37, 245, 229, 199, 183, 200, 146, 240, 188, 223 },
+        { 138,  62, 209, 217, 184, 177, 195, 148, 246, 186, 236 },
+        {  42,  79, 146, 185, 156, 167, 183, 137, 247, 189, 251 }
       }, { /* Coeff Band 1 */
-        {   1, 170, 247, 248, 213, 201, 239, 188, 238, 203, 255 },
-        { 214, 166, 242, 248, 212, 198, 236, 191, 221, 219, 199 },
-        { 139, 148, 224, 247, 207, 197, 236, 189, 249, 241, 128 },
-        { 102, 127, 195, 244, 190, 198, 235, 189, 239, 202, 228 },
-        {  76, 106, 154, 227, 159, 176, 234, 182, 243, 216, 229 },
-        {  52,  69,  93, 158, 125, 155, 173, 139, 225, 170, 209 }
+        {   1, 205, 242, 248, 210, 202, 245, 193, 233, 230, 255 },
+        { 191, 185, 234, 249, 210, 201, 245, 194, 255, 197, 128 },
+        { 112, 148, 214, 247, 208, 201, 246, 192, 255, 238, 128 },
+        {  76, 120, 182, 246, 190, 198, 246, 202, 255, 244, 128 },
+        {  51,  95, 145, 232, 156, 177, 246, 199, 255, 233, 128 },
+        {  47,  71, 104, 195, 129, 158, 230, 167, 253, 224, 255 }
       }, { /* Coeff Band 2 */
-        {   1, 139, 241, 245, 205, 193, 230, 177, 239, 198, 183 },
-        { 131, 139, 214, 240, 191, 189, 224, 181, 236, 203, 194 },
-        {  32, 102, 157, 228, 167, 177, 221, 174, 235, 191, 194 },
-        {  12,  75, 112, 201, 142, 163, 208, 161, 227, 180, 200 },
-        {   2,  45,  66, 142, 119, 154, 178, 141, 220, 171, 213 },
-        {   1,  15,  20,  56, 102, 151,  87, 104, 182, 136, 175 }
+        {   1, 182, 235, 247, 204, 195, 246, 202, 255, 227, 128 },
+        { 104, 145, 204, 243, 189, 191, 242, 199, 255, 229, 128 },
+        {  35, 107, 159, 234, 167, 181, 244, 188, 255, 221, 128 },
+        {  17,  87, 126, 216, 151, 168, 242, 179, 255, 242, 128 },
+        {   4,  68,  91, 182, 131, 154, 222, 153, 255, 228, 128 },
+        {   1,  55,  64, 126, 105, 137, 193, 121, 247, 194, 255 }
       }, { /* Coeff Band 3 */
-        {   1, 174, 243, 248, 212, 201, 237, 194, 249, 207, 255 },
-        { 134, 155, 223, 244, 200, 195, 230, 184, 248, 189, 233 },
-        {  26, 115, 177, 235, 180, 185, 225, 176, 245, 198, 255 },
-        {   8,  82, 129, 217, 156, 175, 220, 168, 243, 204, 228 },
-        {   3,  48,  75, 165, 122, 155, 193, 145, 245, 189, 199 },
-        {   1,  15,  27,  73, 101, 139, 117, 112, 212, 157, 209 }
+        {   1, 210, 239, 249, 209, 201, 249, 205, 255, 255, 128 },
+        {  91, 162, 218, 247, 200, 195, 250, 199, 255, 255, 128 },
+        {  16, 116, 173, 242, 184, 190, 251, 193, 255, 205, 128 },
+        {   5,  85, 133, 228, 156, 178, 244, 184, 255, 251, 128 },
+        {   1,  55,  83, 196, 125, 164, 236, 168, 249, 249, 255 },
+        {   1,  24,  39, 127,  92, 154, 183, 133, 255, 192, 128 }
       }, { /* Coeff Band 4 */
-        {   1, 191, 244, 248, 214, 200, 229, 185, 249, 207, 255 },
-        { 106, 167, 221, 242, 198, 192, 223, 178, 245, 202, 246 },
-        {  13, 117, 169, 229, 175, 182, 220, 170, 244, 202, 226 },
-        {   2,  74, 114, 203, 143, 170, 211, 160, 248, 199, 232 },
-        {   1,  35,  58, 141, 111, 144, 184, 132, 244, 196, 239 },
-        {   1,  12,  22,  66,  91, 138, 114, 102, 225, 156, 214 }
+        {   1, 225, 242, 252, 218, 205, 251, 207, 255, 255, 128 },
+        {  67, 174, 223, 249, 205, 199, 250, 210, 255, 234, 128 },
+        {  10, 119, 177, 243, 186, 187, 253, 199, 255, 255, 128 },
+        {   2,  81, 129, 228, 154, 177, 244, 193, 255, 251, 128 },
+        {   1,  48,  78, 193, 122, 152, 240, 171, 255, 240, 128 },
+        {   1,  19,  43, 116,  96, 128, 195, 135, 255, 234, 128 }
       }, { /* Coeff Band 5 */
-        {   1, 220, 231, 246, 203, 196, 239, 188, 255, 212, 255 },
-        {  42, 155, 203, 241, 189, 191, 235, 184, 253, 220, 255 },
-        {   4,  95, 151, 230, 167, 182, 234, 178, 252, 217, 243 },
-        {   1,  61, 105, 206, 140, 168, 226, 167, 250, 215, 242 },
-        {   1,  31,  60, 151, 109, 148, 204, 142, 250, 208, 230 },
-        {   1,  13,  26,  76,  93, 132, 139, 106, 236, 171, 237 }
+        {   1, 237, 210, 255, 213, 219, 255, 235, 255, 219, 128 },
+        {  49, 163, 203, 252, 182, 198, 255, 235, 255, 255, 128 },
+        {  23, 114, 156, 247, 196, 187, 255, 238, 255, 255, 128 },
+        {   6,  71, 124, 248, 163, 202, 253, 203, 255, 255, 128 },
+        {   1,  35,  74, 226, 160, 162, 246, 189, 255, 244, 128 },
+        {   1,  16,  19, 136,  92, 164, 237, 108, 255, 255, 128 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES_32X32] = {
+static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
@@ -607,6 +607,90 @@
         {   1,  61,  99, 193, 137, 164, 207, 155, 239, 197, 208 },
         {   1,  28,  49, 128, 105, 145, 177, 130, 234, 185, 206 },
         {   1,   9,  16,  48,  89, 134,  89,  99, 183, 140, 169 }
+      }
+    }
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 176,  22, 201, 227, 185, 189, 160, 172, 115, 141, 105 },
+        {  64,  33, 120, 195, 149, 171, 170, 150, 182, 175, 139 },
+        {  12,  33,  68, 151, 118, 153, 172, 138, 202, 175, 153 }
+      }, { /* Coeff Band 1 */
+        {   1, 125, 175, 228, 163, 176, 215, 171, 226, 193, 165 },
+        { 127, 126, 174, 224, 163, 177, 212, 167, 225, 175, 235 },
+        {  57, 114, 159, 223, 166, 175, 216, 167, 234, 182, 211 },
+        {  23,  93, 133, 215, 150, 174, 216, 171, 233, 174, 176 },
+        {   4,  56,  84, 178, 127, 157, 209, 149, 233, 197, 194 },
+        {   1,  19,  26,  70,  93, 136, 114, 108, 193, 150, 167 }
+      }, { /* Coeff Band 2 */
+        {   1,  76, 172, 217, 161, 172, 216, 165, 240, 188, 226 },
+        {  41,  73, 136, 208, 152, 168, 214, 163, 233, 189, 248 },
+        {  14,  59, 102, 195, 137, 163, 209, 158, 227, 184, 204 },
+        {   4,  45,  75, 168, 122, 153, 197, 148, 231, 193, 178 },
+        {   1,  33,  48, 118, 106, 148, 154, 126, 221, 168, 211 },
+        {   1,  12,  16,  42,  90, 143,  61,  94, 159, 122, 167 }
+      }, { /* Coeff Band 3 */
+        {   1, 134, 186, 226, 173, 180, 208, 172, 220, 179, 205 },
+        {  60, 114, 164, 219, 166, 177, 207, 166, 231, 176, 208 },
+        {  18,  90, 134, 208, 152, 175, 200, 164, 225, 181, 199 },
+        {   7,  67, 102, 189, 139, 164, 192, 155, 225, 172, 209 },
+        {   1,  39,  59, 137, 116, 151, 160, 132, 222, 166, 212 },
+        {   1,  12,  17,  50,  93, 134,  82, 102, 181, 131, 190 }
+      }, { /* Coeff Band 4 */
+        {   1, 160, 195, 229, 180, 185, 204, 163, 243, 185, 223 },
+        {  31, 124, 170, 221, 170, 179, 201, 164, 240, 183, 223 },
+        {   5,  91, 134, 204, 154, 170, 191, 155, 236, 178, 232 },
+        {   1,  62,  95, 173, 135, 159, 180, 145, 234, 179, 225 },
+        {   1,  30,  48, 116, 109, 147, 152, 123, 231, 170, 224 },
+        {   1,  11,  17,  53,  90, 133,  93, 102, 201, 139, 202 }
+      }, { /* Coeff Band 5 */
+        {   1, 215, 203, 233, 186, 183, 226, 170, 249, 213, 225 },
+        {  13, 133, 175, 224, 170, 178, 224, 167, 250, 212, 235 },
+        {   1,  83, 127, 209, 151, 169, 221, 162, 251, 212, 243 },
+        {   1,  53,  85, 182, 127, 157, 213, 153, 250, 210, 234 },
+        {   1,  30,  47, 131, 103, 143, 190, 132, 248, 200, 240 },
+        {   1,  14,  21,  67,  89, 129, 126, 104, 232, 167, 223 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 193,  35, 239, 239, 206, 194, 195, 152, 244, 200, 255 },
+        {  77,  57, 198, 224, 192, 187, 181, 145, 242, 190, 248 },
+        {  21,  54, 149, 197, 172, 171, 169, 138, 238, 178, 241 }
+      }, { /* Coeff Band 1 */
+        {   1, 227, 241, 247, 195, 195, 245, 199, 255, 255, 128 },
+        { 189, 223, 237, 249, 199, 200, 238, 198, 255, 255, 128 },
+        { 125, 204, 226, 247, 198, 199, 251, 213, 255, 255, 128 },
+        { 101, 167, 207, 246, 193, 201, 245, 168, 255, 255, 128 },
+        {  89, 121, 174, 237, 169, 184, 246, 204, 255, 255, 128 },
+        {  71,  79, 135, 216, 149, 170, 234, 168, 255, 226, 128 }
+      }, { /* Coeff Band 2 */
+        {   1, 207, 235, 250, 220, 204, 250, 201, 255, 255, 128 },
+        { 103, 160, 210, 245, 195, 188, 249, 195, 255, 255, 128 },
+        {  33, 130, 165, 234, 168, 183, 253, 199, 255, 255, 128 },
+        {  10, 113, 138, 223, 146, 180, 248, 199, 255, 255, 128 },
+        {   1,  88, 104, 172, 112, 174, 221, 126, 255, 217, 128 },
+        {   1,  87,  70, 160,  68, 140, 171,  85, 255,  85, 128 }
+      }, { /* Coeff Band 3 */
+        {   1, 230, 240, 249, 209, 200, 243, 199, 255, 228, 128 },
+        {  60, 178, 218, 247, 203, 200, 247, 198, 255, 255, 128 },
+        {   8, 119, 162, 241, 188, 185, 252, 202, 255, 255, 128 },
+        {   2,  78, 119, 218, 149, 162, 247, 184, 255, 255, 128 },
+        {   1,  48,  81, 172, 142, 148, 239, 140, 255, 239, 128 },
+        {   1,  29,  23,  82,  96, 102, 181, 149, 255, 255, 128 }
+      }, { /* Coeff Band 4 */
+        {   1, 240, 241, 250, 216, 203, 248, 188, 255, 255, 128 },
+        {  60, 180, 222, 247, 202, 195, 247, 191, 255, 255, 128 },
+        {   9, 120, 169, 240, 190, 189, 249, 181, 255, 255, 128 },
+        {   2,  85, 126, 223, 154, 178, 240, 184, 255, 255, 128 },
+        {   1,  47,  90, 198, 132, 158, 233, 162, 255, 224, 128 },
+        {   1,  33,  34, 143, 116, 156, 217, 128, 255, 255, 128 }
+      }, { /* Coeff Band 5 */
+        {   1, 250, 193, 249, 188, 193, 255, 236, 255, 255, 128 },
+        {  35, 187, 185, 247, 154, 184, 255, 247, 255, 171, 128 },
+        {  20, 132, 114, 223, 172, 165, 255, 229, 255, 255, 128 },
+        {   4,  97,  96, 218,  96, 162, 255, 164, 255, 253, 128 },
+        {   1,  57,  35, 197, 154, 173, 254, 215, 255, 255, 128 },
+        {   1,   8,   2, 161,  10,  57, 230, 228, 255, 171, 128 }
       }
     }
   }
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -336,6 +336,6 @@
                     BLOCK_TYPES, cm->fc.coef_counts_16x16,
                     count_sat, update_factor);
   update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
-                    BLOCK_TYPES_32X32, cm->fc.coef_counts_32x32,
+                    BLOCK_TYPES, cm->fc.coef_counts_32x32,
                     count_sat, update_factor);
 }
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -61,7 +61,6 @@
 
 /* Outside dimension.  0 = Y with DC, 1 = UV */
 #define BLOCK_TYPES 2
-#define BLOCK_TYPES_32X32 1
 #define REF_TYPES 2  // intra=0, inter=1
 
 /* Middle dimension reflects the coefficient position within the transform. */
@@ -110,10 +109,22 @@
 void vp9_coef_tree_initialize(void);
 void vp9_adapt_coef_probs(struct VP9Common *);
 
-static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
+static INLINE void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
   /* Clear entropy contexts */
   vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
   vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+}
+
+static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd) {
+  /* Zero two ENTROPY_CONTEXT_PLANES entries at each of above/left context */
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+}
+
+static INLINE void vp9_reset_sb64_tokens_context(MACROBLOCKD* const xd) {
+  /* Zero four ENTROPY_CONTEXT_PLANES entries at each of above/left context */
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
 }
 
 extern const int vp9_coef_bands[32];
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -645,7 +645,7 @@
   // First transform rows
   for (i = 0; i < 16; ++i) {
     idct16_1d(input, outptr);
-    input += half_pitch;
+    input += 16;
     outptr += 16;
   }
 
@@ -655,7 +655,7 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * 16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
 }
 
@@ -838,7 +838,7 @@
 };
 
 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int input_pitch, TX_TYPE tx_type) {
+                          int pitch, TX_TYPE tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -848,7 +848,7 @@
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr);
-    input += input_pitch;
+    input += 16;
     outptr += 16;
   }
 
@@ -858,7 +858,7 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * 16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
 }
 
@@ -875,7 +875,7 @@
     vpx_memset(out, 0, sizeof(out));
     for (i = 0; i < 4; ++i) {
       idct16_1d(input, outptr);
-      input += half_pitch;
+      input += 16;
       outptr += 16;
     }
 
@@ -885,7 +885,7 @@
         temp_in[j] = out[j*16 + i];
       idct16_1d(temp_in, temp_out);
       for (j = 0; j < 16; ++j)
-        output[j*16 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
     }
 }
 
@@ -1273,7 +1273,7 @@
   // Rows
   for (i = 0; i < 32; ++i) {
     idct32_1d(input, outptr);
-    input += half_pitch;
+    input += 32;
     outptr += 32;
   }
 
@@ -1283,7 +1283,7 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
 }
 
@@ -1306,7 +1306,7 @@
   vpx_memset(out, 0, sizeof(out));
   for (i = 0; i < 4; ++i) {
     idct32_1d(input, outptr);
-    input += half_pitch;
+    input += 32;
     outptr += 32;
   }
 
@@ -1316,6 +1316,6 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
   }
 }
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,12 +11,13 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) {
-  BLOCKD *b = &xd->block[block];
-  if (xd->eobs[block] <= 1)
-    xd->inv_txm4x4_1(b->dqcoeff, b->diff, pitch);
+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
+                                 int16_t *dqcoeff, int16_t *diff,
+                                 int pitch) {  // eob and buffers come from caller
+  if (eob <= 1)  // eob of 0 or 1 dispatches to xd->inv_txm4x4_1
+    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
   else
-    xd->inv_txm4x4(b->dqcoeff, b->diff, pitch);
+    xd->inv_txm4x4(dqcoeff, diff, pitch);  // any larger eob: xd->inv_txm4x4
 }
 
 void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
@@ -27,7 +28,8 @@
     if (tx_type != DCT_DCT) {
       vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
-      vp9_inverse_transform_b_4x4(xd, i, 32);
+      vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
+                                  xd->block[i].diff, 32);
     }
   }
 }
@@ -36,7 +38,8 @@
   int i;
 
   for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(xd, i, 16);
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
+                                xd->block[i].diff, 16);
   }
 }
 
@@ -111,13 +114,170 @@
   vp9_inverse_transform_mbuv_8x8(xd);
 }
 
-void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {
-  vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);
+void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) {
+  vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64);  // one block at offset 0
 }
 
-void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {
-  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,
-                                xd_sb->diff + 1024, 32);
-  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,
-                                xd_sb->diff + 1280, 32);
+void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) {
+  int n;  // 16x16 block index, raster order over a 2x2 grid
+
+  for (n = 0; n < 4; n++) {  // block n reads 256 coefficients at n * 256
+    const int x_idx = n & 1, y_idx = n >> 1;  // column / row of block n
+
+    vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
+                                  xd->diff + x_idx * 16 + y_idx * 32 * 16, 64);
+  }
+}
+
+void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) {
+  int n;  // 8x8 block index, raster order over a 4x4 grid
+
+  for (n = 0; n < 16; n++) {  // block n reads 64 coefficients at n * 64
+    const int x_idx = n & 3, y_idx = n >> 2;  // column / row of block n
+
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
+                                xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);
+  }
+}
+
+void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) {
+  int n;  // 4x4 block index, raster order over an 8x8 grid
+
+  for (n = 0; n < 64; n++) {  // block n: 16 coefficients, eob in xd->eobs[n]
+    const int x_idx = n & 7, y_idx = n >> 3;  // column / row of block n
+
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
+                                xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);
+  }
+}
+
+void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) {
+  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024,  // block at offset 1024
+                                xd->diff + 1024, 32);
+  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280,  // next block, 256 later
+                                xd->diff + 1280, 32);
+}
+
+void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) {
+  int n;  // 8x8 block index, raster order over a 2x2 grid in each plane
+
+  for (n = 0; n < 4; n++) {  // same block position handled at 1024 and 1280
+    const int x_idx = n & 1, y_idx = n >> 1;  // column / row of block n
+
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64,
+                                xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8,
+                                32);
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64,
+                                xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8,
+                                32);
+  }
+}
+
+void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) {
+  int n;  // 4x4 block index, raster order over a 4x4 grid in each plane
+
+  for (n = 0; n < 16; n++) {  // eobs for the two planes start at 64 and 80
+    const int x_idx = n & 3, y_idx = n >> 2;  // column / row of block n
+
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n],
+                                xd->dqcoeff + 1024 + n * 16,
+                                xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4,
+                                32);
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n],
+                                xd->dqcoeff + 1280 + n * 16,
+                                xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4,
+                                32);
+  }
+}
+
+void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) {
+  int n;  // 32x32 block index, raster order over a 2x2 grid
+
+  for (n = 0; n < 4; n++) {  // block n reads 1024 coefficients at n * 1024
+    const int x_idx = n & 1, y_idx = n >> 1;  // column / row of block n
+
+    vp9_short_idct32x32(xd->dqcoeff + n * 1024,
+                        xd->diff + x_idx * 32 + y_idx * 32 * 64, 128);
+  }
+}
+
+void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) {
+  int n;  // 16x16 block index, raster order over a 4x4 grid
+
+  for (n = 0; n < 16; n++) {  // block n reads 256 coefficients at n * 256
+    const int x_idx = n & 3, y_idx = n >> 2;  // column / row of block n
+
+    vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
+                                  xd->diff + x_idx * 16 + y_idx * 64 * 16, 128);
+  }
+}
+
+void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) {
+  int n;  // 8x8 block index, raster order over an 8x8 grid
+
+  for (n = 0; n < 64; n++) {  // block n reads 64 coefficients at n * 64
+    const int x_idx = n & 7, y_idx = n >> 3;  // column / row of block n
+
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
+                                xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);
+  }
+}
+
+void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) {
+  int n;  // 4x4 block index, raster order over a 16x16 grid
+
+  for (n = 0; n < 256; n++) {  // block n: 16 coefficients, eob in xd->eobs[n]
+    const int x_idx = n & 15, y_idx = n >> 4;  // column / row of block n
+
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
+                                xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);
+  }
+}
+
+void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) {
+  vp9_short_idct32x32(xd->dqcoeff + 4096,  // block at offset 4096
+                      xd->diff + 4096, 64);
+  vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024,  // next block, 1024 later
+                      xd->diff + 4096 + 1024, 64);
+}
+
+void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) {
+  int n;  // 16x16 block index, raster order over a 2x2 grid in each plane
+
+  for (n = 0; n < 4; n++) {  // off: diff offset of block n inside its plane
+    const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16;
+
+    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256,
+                                  xd->diff + 4096 + off, 64);
+    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256,
+                                  xd->diff + 4096 + 1024 + off, 64);
+  }
+}
+
+void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) {
+  int n;  // 8x8 block index, raster order over a 4x4 grid in each plane
+
+  for (n = 0; n < 16; n++) {  // off: diff offset of block n inside its plane
+    const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8;
+
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64,
+                                xd->diff + 4096 + off, 64);
+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64,
+                                xd->diff + 4096 + 1024 + off, 64);
+  }
+}
+
+void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) {
+  int n;  // 4x4 block index, raster order over an 8x8 grid in each plane
+
+  for (n = 0; n < 64; n++) {  // eobs for the two planes start at 256 and 320
+    const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4;
+
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n],
+                                xd->dqcoeff + 4096 + n * 16,
+                                xd->diff + 4096 + off, 64);
+    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n],
+                                xd->dqcoeff + 4096 + 1024 + n * 16,
+                                xd->diff + 4096 + 1024 + off, 64);
+  }
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,7 +15,9 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
+                                 int16_t *dqcoeff, int16_t *diff,
+                                 int pitch);
 
 void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
 
@@ -39,7 +41,21 @@
 
 void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
 
-void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
-void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
+void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd);
+void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd);
+void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd);
+
+void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd);
+void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd);
 
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -61,7 +61,7 @@
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
@@ -83,12 +83,12 @@
   vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
 
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -117,7 +117,7 @@
 
 void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
   int x, y, stride = xd->block[0].dst_stride;
-  int16_t *diff = xd->sb_coeff_data.diff;
+  int16_t *diff = xd->diff;
 
   for (y = 0; y < 32; y++) {
     for (x = 0; x < 32; x++) {
@@ -130,8 +130,8 @@
 
 void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
   int x, y, stride = xd->block[16].dst_stride;
-  int16_t *udiff = xd->sb_coeff_data.diff + 1024;
-  int16_t *vdiff = xd->sb_coeff_data.diff + 1280;
+  int16_t *udiff = xd->diff + 1024;
+  int16_t *vdiff = xd->diff + 1280;
 
   for (y = 0; y < 16; y++) {
     for (x = 0; x < 16; x++) {
@@ -142,6 +142,36 @@
     vdst += stride;
     udiff += 16;
     vdiff += 16;
+  }
+}
+
+void vp9_recon_sb64y_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y, stride = xd->block[0].dst_stride;  // dst row step from block 0
+  int16_t *diff = xd->diff;  // 64x64 residual starts at offset 0
+
+  for (y = 0; y < 64; y++) {
+    for (x = 0; x < 64; x++) {
+      dst[x] = clip_pixel(dst[x] + diff[x]);  // add residual to dst in place
+    }
+    dst += stride;
+    diff += 64;  // residual rows are packed 64 elements apart
+  }
+}
+
+void vp9_recon_sb64uv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, stride = xd->block[16].dst_stride;  // shared by udst and vdst
+  int16_t *udiff = xd->diff + 4096;  // U residual follows the first 4096
+  int16_t *vdiff = xd->diff + 4096 + 1024;  // V residual, 1024 after U
+
+  for (y = 0; y < 32; y++) {
+    for (x = 0; x < 32; x++) {
+      udst[x] = clip_pixel(udst[x] + udiff[x]);  // add residual in place
+      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
+    }
+    udst += stride;
+    vdst += stride;
+    udiff += 32;  // residual rows are packed 32 elements apart
+    vdiff += 32;
   }
 }
 
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -97,6 +97,12 @@
 prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
 specialize void vp9_recon_sbuv_s
 
+prototype void vp9_recon_sb64y_s "struct macroblockd *x, uint8_t *dst"
+specialize vp9_recon_sb64y_s
+
+prototype void vp9_recon_sb64uv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
+specialize vp9_recon_sb64uv_s
+
 prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
 specialize vp9_build_intra_predictors_mby_s
 
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -452,125 +452,12 @@
   }
 }
 
-static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc, int n,
-                            int maska, int shiftb) {
-  int x_idx = n & maska, y_idx = n >> shiftb;
-  TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
-  if (tx_type != DCT_DCT) {
-    vp9_ht_dequant_idct_add_16x16_c(
-        tx_type, xd->qcoeff, xd->block[0].dequant,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->dst.y_stride, xd->eobs[0]);
-  } else {
-    vp9_dequant_idct_add_16x16(
-        xd->qcoeff, xd->block[0].dequant,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd->dst.y_stride, xd->eobs[0]);
-  }
-  vp9_dequant_idct_add_uv_block_8x8_inplace_c(
-      xd->qcoeff + 16 * 16,
-      xd->block[16].dequant,
-      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.uv_stride, xd);
-};
-
-static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n,
-                          int maska, int shiftb) {
-  int x_idx = n & maska, y_idx = n >> shiftb;
-  TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
-  if (tx_type != DCT_DCT) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-      int16_t *q  = xd->block[idx].qcoeff;
-      int16_t *dq = xd->block[0].dequant;
-      int stride = xd->dst.y_stride;
-      tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(
-            tx_type, q, dq,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
-            + x_idx * 16 + (i & 1) * 8,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
-            + x_idx * 16 + (i & 1) * 8,
-            stride, stride, xd->eobs[idx]);
-      } else {
-        vp9_dequant_idct_add_8x8_c(
-            q, dq,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
-            + x_idx * 16 + (i & 1) * 8,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride
-            + x_idx * 16 + (i & 1) * 8,
-            stride, stride, xd->eobs[idx]);
-      }
-    }
-  } else {
-    vp9_dequant_idct_add_y_block_8x8_inplace_c(
-        xd->qcoeff, xd->block[0].dequant,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd);
-  }
-  vp9_dequant_idct_add_uv_block_8x8_inplace_c(
-      xd->qcoeff + 16 * 16, xd->block[16].dequant,
-      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.uv_stride, xd);
-};
-
-static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n,
-                          int maska, int shiftb) {
-  int x_idx = n & maska, y_idx = n >> shiftb;
-  TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);
-  if (tx_type != DCT_DCT) {
-    int i;
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      tx_type = get_tx_type_4x4(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(
-            tx_type, b->qcoeff, b->dequant,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
-            + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
-            + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride, xd->eobs[i]);
-      } else {
-        xd->itxm_add(
-            b->qcoeff, b->dequant,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
-            + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride
-            + x_idx * 16 + (i & 3) * 4,
-            xd->dst.y_stride, xd->dst.y_stride, xd->eobs[i]);
-      }
-    }
-  } else {
-    vp9_dequant_idct_add_y_block_4x4_inplace_c(
-        xd->qcoeff, xd->block[0].dequant,
-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
-        xd->dst.y_stride, xd);
-  }
-  vp9_dequant_idct_add_uv_block_4x4_inplace_c(
-      xd->qcoeff + 16 * 16, xd->block[16].dequant,
-      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
-      xd->dst.uv_stride, xd);
-};
-
 static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                 int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
   int n, eobtotal;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
-  MODE_INFO *orig_mi = xd->mode_info_context;
+  MODE_INFO *mi = xd->mode_info_context;
   const int mis = pc->mode_info_stride;
 
   assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
@@ -583,21 +470,8 @@
     mb_init_dequantizer(pbi, xd);
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    int n;
+    vp9_reset_sb64_tokens_context(xd);
 
-    vp9_reset_mb_tokens_context(xd);
-    for (n = 1; n <= 3; n++) {
-      if (mb_col < pc->mb_cols - n)
-        xd->above_context += n;
-      if (mb_row < pc->mb_rows - n)
-        xd->left_context += n;
-      vp9_reset_mb_tokens_context(xd);
-      if (mb_col < pc->mb_cols - n)
-        xd->above_context -= n;
-      if (mb_row < pc->mb_rows - n)
-        xd->left_context -= n;
-    }
-
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
      */
@@ -617,74 +491,101 @@
   }
 
   /* dequantization and idct */
-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
-    for (n = 0; n < 4; n++) {
-      const int x_idx = n & 1, y_idx = n >> 1;
+  eobtotal = vp9_decode_sb64_tokens(pbi, xd, bc);
+  if (eobtotal == 0) {  // skip loopfilter
+    for (n = 0; n < 16; n++) {
+      const int x_idx = n & 3, y_idx = n >> 2;
 
-      if (mb_col + x_idx * 2 >= pc->mb_cols ||
-          mb_row + y_idx * 2 >= pc->mb_rows)
-        continue;
-
-      xd->left_context = pc->left_context + (y_idx << 1);
-      xd->above_context = pc->above_context + mb_col + (x_idx << 1);
-      xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis;
-      eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
-      if (eobtotal == 0) {  // skip loopfilter
-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        if (mb_col + 1 < pc->mb_cols)
-          xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
-        if (mb_row + 1 < pc->mb_rows) {
-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
-          if (mb_col + 1 < pc->mb_cols)
-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
+      if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows)
+        mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    }
+  } else {
+    switch (xd->mode_info_context->mbmi.txfm_size) {
+      case TX_32X32:
+        for (n = 0; n < 4; n++) {
+          const int x_idx = n & 1, y_idx = n >> 1;
+          vp9_dequant_idct_add_32x32(xd->qcoeff + n * 1024,
+              xd->block[0].dequant,
+              xd->dst.y_buffer + x_idx * 32 + y_idx * xd->dst.y_stride * 32,
+              xd->dst.y_buffer + x_idx * 32 + y_idx * xd->dst.y_stride * 32,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 64]);
         }
-      } else {
-        vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
-                                   xd->dst.y_buffer + x_idx * 32 +
-                                       xd->dst.y_stride * y_idx * 32,
-                                   xd->dst.y_buffer + x_idx * 32 +
-                                       xd->dst.y_stride * y_idx * 32,
-                                   xd->dst.y_stride, xd->dst.y_stride,
-                                   xd->eobs[0]);
-        vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
-                                              xd->block[16].dequant,
-                                              xd->dst.u_buffer + x_idx * 16 +
-                                                xd->dst.uv_stride * y_idx * 16,
-                                              xd->dst.v_buffer + x_idx * 16 +
-                                                xd->dst.uv_stride * y_idx * 16,
-                                              xd->dst.uv_stride, xd);
-      }
+        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096,
+            xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer,
+            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256]);
+        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096 + 1024,
+            xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,
+            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]);
+        break;
+      case TX_16X16:  // FIXME(rbultje): adst
+        for (n = 0; n < 16; n++) {
+          const int x_idx = n & 3, y_idx = n >> 2;
+          vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+        }
+        for (n = 0; n < 4; n++) {
+          const int x_idx = n & 1, y_idx = n >> 1;
+          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + n * 256,
+              xd->block[16].dequant,
+              xd->dst.u_buffer + y_idx * 16 * xd->dst.uv_stride + x_idx * 16,
+              xd->dst.u_buffer + y_idx * 16 * xd->dst.uv_stride + x_idx * 16,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 16]);
+          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + 1024 + n * 256,
+              xd->block[20].dequant,
+              xd->dst.v_buffer + y_idx * 16 * xd->dst.uv_stride + x_idx * 16,
+              xd->dst.v_buffer + y_idx * 16 * xd->dst.uv_stride + x_idx * 16,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]);
+        }
+        break;
+      case TX_8X8:  // FIXME(rbultje): adst
+        for (n = 0; n < 64; n++) {
+          const int x_idx = n & 7, y_idx = n >> 3;
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+        }
+        for (n = 0; n < 16; n++) {
+          const int x_idx = n & 3, y_idx = n >> 2;
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096,
+              xd->block[16].dequant,
+              xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 4]);
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096 + 1024,
+              xd->block[20].dequant,
+              xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]);
+        }
+        break;
+      case TX_4X4:  // FIXME(rbultje): adst
+        for (n = 0; n < 256; n++) {
+          const int x_idx = n & 15, y_idx = n >> 4;
+          xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+        }
+        for (n = 0; n < 64; n++) {
+          const int x_idx = n & 7, y_idx = n >> 3;
+          xd->itxm_add(xd->qcoeff + 4096 + n * 16,
+              xd->block[16].dequant,
+              xd->dst.u_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.u_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n]);
+          xd->itxm_add(xd->qcoeff + 4096 + 1024 + n * 16,
+              xd->block[20].dequant,
+              xd->dst.v_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.v_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n]);
+        }
+        break;
+      default: assert(0);
     }
-  } else {
-    for (n = 0; n < 16; n++) {
-      int x_idx = n & 3, y_idx = n >> 2;
-
-      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
-        continue;
-
-      xd->above_context = pc->above_context + mb_col + x_idx;
-      xd->left_context = pc->left_context + y_idx;
-      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
-
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-      if (eobtotal == 0) {  // skip loopfilter
-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        continue;
-      }
-
-      if (tx_size == TX_16X16) {
-        decode_16x16_sb(pbi, xd, bc, n, 3, 2);
-      } else if (tx_size == TX_8X8) {
-        decode_8x8_sb(pbi, xd, bc, n, 3, 2);
-      } else {
-        decode_4x4_sb(pbi, xd, bc, n, 3, 2);
-      }
-    }
   }
-
-  xd->above_context = pc->above_context + mb_col;
-  xd->left_context = pc->left_context;
-  xd->mode_info_context = orig_mi;
 }
 
 static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
@@ -691,9 +592,7 @@
                                 int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
   int n, eobtotal;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
-  MODE_INFO *orig_mi = xd->mode_info_context;
   const int mis = pc->mode_info_stride;
 
   assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
@@ -706,16 +605,7 @@
     mb_init_dequantizer(pbi, xd);
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_mb_tokens_context(xd);
-    if (mb_col < pc->mb_cols - 1)
-      xd->above_context++;
-    if (mb_row < pc->mb_rows - 1)
-      xd->left_context++;
-    vp9_reset_mb_tokens_context(xd);
-    if (mb_col < pc->mb_cols - 1)
-      xd->above_context--;
-    if (mb_row < pc->mb_rows - 1)
-      xd->left_context--;
+    vp9_reset_sb_tokens_context(xd);
 
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
@@ -736,56 +626,90 @@
   }
 
   /* dequantization and idct */
-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
-    eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
-    if (eobtotal == 0) {  // skip loopfilter
-      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+  eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
+  if (eobtotal == 0) {  // skip loopfilter
+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+    if (mb_col + 1 < pc->mb_cols)
+      xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
+    if (mb_row + 1 < pc->mb_rows) {
+      xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
       if (mb_col + 1 < pc->mb_cols)
-        xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
-      if (mb_row + 1 < pc->mb_rows) {
-        xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
-        if (mb_col + 1 < pc->mb_cols)
-          xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
-      }
-    } else {
-      vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
-                                 xd->dst.y_buffer, xd->dst.y_buffer,
-                                 xd->dst.y_stride, xd->dst.y_stride,
-                                 xd->eobs[0]);
-      vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
-                                            xd->block[16].dequant,
-                                            xd->dst.u_buffer, xd->dst.v_buffer,
-                                            xd->dst.uv_stride, xd);
+        xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
     }
   } else {
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-
-      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
-        continue;
-
-      xd->above_context = pc->above_context + mb_col + x_idx;
-      xd->left_context = pc->left_context + y_idx + (mb_row & 2);
-      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
-
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-      if (eobtotal == 0) {  // skip loopfilter
-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        continue;
-      }
-
-      if (tx_size == TX_16X16) {
-        decode_16x16_sb(pbi, xd, bc, n, 1, 1);
-      } else if (tx_size == TX_8X8) {
-        decode_8x8_sb(pbi, xd, bc, n, 1, 1);
-      } else {
-        decode_4x4_sb(pbi, xd, bc, n, 1, 1);
-      }
+    switch (xd->mode_info_context->mbmi.txfm_size) {
+      case TX_32X32:
+        vp9_dequant_idct_add_32x32(xd->qcoeff, xd->block[0].dequant,
+                                   xd->dst.y_buffer, xd->dst.y_buffer,
+                                   xd->dst.y_stride, xd->dst.y_stride,
+                                   xd->eobs[0]);
+        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
+                                              xd->block[16].dequant,
+                                              xd->dst.u_buffer,
+                                              xd->dst.v_buffer,
+                                              xd->dst.uv_stride, xd);
+        break;
+      case TX_16X16:  // FIXME(rbultje): adst
+        for (n = 0; n < 4; n++) {
+          const int x_idx = n & 1, y_idx = n >> 1;
+          vp9_dequant_idct_add_16x16(
+              xd->qcoeff + n * 256, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+              xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
+        }
+        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
+                                              xd->block[16].dequant,
+                                              xd->dst.u_buffer,
+                                              xd->dst.v_buffer,
+                                              xd->dst.uv_stride, xd);
+        break;
+      case TX_8X8:  // FIXME(rbultje): adst
+        for (n = 0; n < 16; n++) {
+          const int x_idx = n & 3, y_idx = n >> 2;
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+              xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
+        }
+        for (n = 0; n < 4; n++) {
+          const int x_idx = n & 1, y_idx = n >> 1;
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1024,
+              xd->block[16].dequant,
+              xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n * 4]);
+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1280,
+              xd->block[20].dequant,
+              xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]);
+        }
+        break;
+      case TX_4X4:  // FIXME(rbultje): adst
+        for (n = 0; n < 64; n++) {
+          const int x_idx = n & 7, y_idx = n >> 3;
+          xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
+              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+              xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4,
+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
+        }
+        for (n = 0; n < 16; n++) {
+          const int x_idx = n & 3, y_idx = n >> 2;
+          xd->itxm_add(xd->qcoeff + 1024 + n * 16,
+              xd->block[16].dequant,
+              xd->dst.u_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.u_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n]);
+          xd->itxm_add(xd->qcoeff + 1280 + n * 16,
+              xd->block[20].dequant,
+              xd->dst.v_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.v_buffer + y_idx * 4 * xd->dst.uv_stride + x_idx * 4,
+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n]);
+        }
+        break;
+      default: assert(0);
     }
-
-    xd->above_context = pc->above_context + mb_col;
-    xd->left_context = pc->left_context + (mb_row & 2);
-    xd->mode_info_context = orig_mi;
   }
 }
 
@@ -1187,7 +1111,7 @@
     read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES);
   }
   if (pbi->common.txfm_mode > ALLOW_16X16) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32);
+    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES);
   }
 }
 
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -354,7 +354,7 @@
                                            int stride,
                                            MACROBLOCKD *xd) {
   vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride,
-                               xd->eobs[16]);
+                               xd->eobs[64]);
   vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride,
-                               xd->eobs[20]);
+                               xd->eobs[80]);
 }
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -90,9 +90,8 @@
                         const int *const scan, TX_SIZE txfm_size) {
   ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;
   ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;
-  const int aidx = vp9_block2above[txfm_size][block_idx];
-  const int lidx = vp9_block2left[txfm_size][block_idx];
-  ENTROPY_CONTEXT above_ec = A0[aidx] != 0, left_ec = L0[lidx] != 0;
+  int aidx, lidx;
+  ENTROPY_CONTEXT above_ec, left_ec;
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int recent_energy = 0;
   int pt, c = 0;
@@ -101,9 +100,22 @@
   vp9_coeff_count *coef_counts;
   const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
 
+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+    aidx = vp9_block2above_sb64[txfm_size][block_idx];
+    lidx = vp9_block2left_sb64[txfm_size][block_idx];
+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
+    aidx = vp9_block2above_sb[txfm_size][block_idx];
+    lidx = vp9_block2left_sb[txfm_size][block_idx];
+  } else {
+    aidx = vp9_block2above[txfm_size][block_idx];
+    lidx = vp9_block2left[txfm_size][block_idx];
+  }
+
   switch (txfm_size) {
     default:
     case TX_4X4:
+      above_ec = A0[aidx] != 0;
+      left_ec = L0[lidx] != 0;
       coef_probs  = fc->coef_probs_4x4;
       coef_counts = fc->coef_counts_4x4;
       break;
@@ -240,7 +252,7 @@
       if (type == PLANE_TYPE_UV) {
         ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
         ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        A1[aidx] = A1[aidx + 1] = L1[aidx] = L1[lidx + 1] = A0[aidx];
+        A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx];
         if (txfm_size >= TX_32X32) {
           ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
           ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
@@ -272,24 +284,181 @@
                          MACROBLOCKD* const xd,
                          BOOL_DECODER* const bc) {
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int i, eobtotal = 0, seg_eob;
+  int i, eobtotal = 0, seg_eob, c;
 
-  // Luma block
-  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_32X32:
+      // Luma block
+      c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
                        DCT_DCT, get_eob(xd, segment_id, 1024),
-                       xd->sb_coeff_data.qcoeff,
-                       vp9_default_zig_zag1d_32x32, TX_32X32);
-  xd->eobs[0] = c;
-  eobtotal += c;
+                       xd->qcoeff, vp9_default_zig_zag1d_32x32, TX_32X32);
+      xd->eobs[0] = c;
+      eobtotal += c;
 
-  // 16x16 chroma blocks
-  seg_eob = get_eob(xd, segment_id, 256);
-  for (i = 16; i < 24; i += 4) {
-    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
-                     xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
-                     vp9_default_zig_zag1d_16x16, TX_16X16);
-    xd->eobs[i] = c;
-    eobtotal += c;
+      // 16x16 chroma blocks
+      seg_eob = get_eob(xd, segment_id, 256);
+      for (i = 64; i < 96; i += 16) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_16X16:
+      // 16x16 luma blocks
+      seg_eob = get_eob(xd, segment_id, 256);
+      for (i = 0; i < 64; i += 16) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 16x16 chroma blocks
+      for (i = 64; i < 96; i += 16) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_8X8:
+      // 8x8 luma blocks
+      seg_eob = get_eob(xd, segment_id, 64);
+      for (i = 0; i < 64; i += 4) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 8x8 chroma blocks
+      for (i = 64; i < 96; i += 4) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_4X4:
+      // 4x4 luma blocks
+      seg_eob = get_eob(xd, segment_id, 16);
+      for (i = 0; i < 64; i++) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 4x4 chroma blocks
+      for (i = 64; i < 96; i++) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    default: assert(0);
+  }
+
+  return eobtotal;
+}
+
+int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
+                           MACROBLOCKD* const xd,
+                           BOOL_DECODER* const bc) {
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int i, eobtotal = 0, seg_eob, c;
+
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_32X32:
+      // Luma block
+      seg_eob = get_eob(xd, segment_id, 1024);
+      for (i = 0; i < 256; i += 64) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_32x32, TX_32X32);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 32x32 chroma blocks
+      for (i = 256; i < 384; i += 64) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_32x32, TX_32X32);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_16X16:
+      // 16x16 luma blocks
+      seg_eob = get_eob(xd, segment_id, 256);
+      for (i = 0; i < 256; i += 16) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 16x16 chroma blocks
+      for (i = 256; i < 384; i += 16) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_16x16, TX_16X16);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_8X8:
+      // 8x8 luma blocks
+      seg_eob = get_eob(xd, segment_id, 64);
+      for (i = 0; i < 256; i += 4) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 8x8 chroma blocks
+      for (i = 256; i < 384; i += 4) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_8x8, TX_8X8);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    case TX_4X4:
+      // 4x4 luma blocks
+      seg_eob = get_eob(xd, segment_id, 16);
+      for (i = 0; i < 256; i++) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
+                         DCT_DCT, seg_eob, xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+
+      // 4x4 chroma blocks
+      for (i = 256; i < 384; i++) {
+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob,
+                         xd->qcoeff + i * 16,
+                         vp9_default_zig_zag1d_4x4, TX_4X4);
+        xd->eobs[i] = c;
+        eobtotal += c;
+      }
+      break;
+    default: assert(0);
   }
 
   return eobtotal;
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -14,8 +14,6 @@
 
 #include "vp9/decoder/vp9_onyxd_int.h"
 
-void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
-
 int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
                          BOOL_DECODER* const bc,
                          PLANE_TYPE type, int i);
@@ -26,6 +24,10 @@
 int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
                          MACROBLOCKD* const xd,
                          BOOL_DECODER* const bc);
+
+int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
+                           MACROBLOCKD* const xd,
+                           BOOL_DECODER* const bc);
 
 int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
                                 BOOL_DECODER* const bc);
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -45,7 +45,7 @@
 vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
 vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
+vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 
 extern unsigned int active_section;
 #endif
@@ -1229,7 +1229,7 @@
 #ifdef ENTROPY_STATS
                           cpi, context_counters_32x32,
 #endif
-                          cpi->frame_branch_ct_32x32, BLOCK_TYPES_32X32);
+                          cpi->frame_branch_ct_32x32, BLOCK_TYPES);
 }
 
 static void update_coef_probs_common(vp9_writer* const bc,
@@ -1388,7 +1388,7 @@
                              cpi->frame_coef_probs_32x32,
                              cpi->common.fc.coef_probs_32x32,
                              cpi->frame_branch_ct_32x32,
-                             BLOCK_TYPES_32X32);
+                             BLOCK_TYPES);
   }
 }
 
@@ -2106,13 +2106,13 @@
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
   print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,
-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
+                             "vp9_coef_update_probs_4x4[BLOCK_TYPES]");
   print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,
-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
+                             "vp9_coef_update_probs_8x8[BLOCK_TYPES]");
   print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,
-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
-  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32,
-                             "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]");
+                             "vp9_coef_update_probs_16x16[BLOCK_TYPES]");
+  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES,
+                             "vp9_coef_update_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
   f = fopen("treeupdate.bin", "wb");
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -83,19 +83,12 @@
   int64_t txfm_rd_diff[NB_TXFM_MODES];
 } PICK_MODE_CONTEXT;
 
-typedef struct superblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[32*32+16*16*2]);
-  DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]);
-} SUPERBLOCK;
-
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[384]);  // 16x16 Y 8x8 U 8x8 V
-  DECLARE_ALIGNED(16, int16_t, coeff[384]);     // 16x16 Y 8x8 U 8x8 V
+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);
+  DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);
   // 16 Y blocks, 4 U blocks, 4 V blocks,
   BLOCK block[24];
-
-  SUPERBLOCK sb_coeff_data;
 
   YV12_BUFFER_CONFIG src;
 
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1819,63 +1819,6 @@
 #endif
 }
 
-static void update_sb_skip_coeff_state(VP9_COMP *cpi,
-                                       ENTROPY_CONTEXT_PLANES ta[4],
-                                       ENTROPY_CONTEXT_PLANES tl[4],
-                                       TOKENEXTRA *t[4],
-                                       TOKENEXTRA **tp,
-                                       int skip[4], int output_enabled) {
-  MACROBLOCK *const x = &cpi->mb;
-  TOKENEXTRA tokens[4][16 * 25];
-  int n_tokens[4], n;
-
-  // if there were no skips, we don't need to do anything
-  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
-    return;
-
-  // if we don't do coeff skipping for this frame, we don't
-  // need to do anything here
-  if (!cpi->common.mb_no_coeff_skip)
-    return;
-
-  // if all 4 MBs skipped coeff coding, nothing to be done
-  if (skip[0] && skip[1] && skip[2] && skip[3])
-    return;
-
-  // so the situation now is that we want to skip coeffs
-  // for some MBs, but not all, and we didn't code EOB
-  // coefficients for them. However, the skip flag for this
-  // SB will be 0 overall, so we need to insert EOBs in the
-  // middle of the token tree. Do so here.
-  n_tokens[0] = t[1] - t[0];
-  n_tokens[1] = t[2] - t[1];
-  n_tokens[2] = t[3] - t[2];
-  n_tokens[3] = *tp  - t[3];
-  if (n_tokens[0])
-    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
-  if (n_tokens[1])
-    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
-  if (n_tokens[2])
-    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
-  if (n_tokens[3])
-    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
-
-  // reset pointer, stuff EOBs where necessary
-  *tp = t[0];
-  for (n = 0; n < 4; n++) {
-    if (skip[n]) {
-      x->e_mbd.above_context = &ta[n];
-      x->e_mbd.left_context  = &tl[n];
-      vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
-    } else {
-      if (n_tokens[n]) {
-        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-      }
-      (*tp) += n_tokens[n];
-    }
-  }
-}
-
 static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
                                          ENTROPY_CONTEXT_PLANES ta[16],
                                          ENTROPY_CONTEXT_PLANES tl[16],
@@ -1993,7 +1936,9 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const int mis = cm->mode_info_stride;
   unsigned char ref_pred_flag;
 
   assert(!xd->mode_info_context->mbmi.sb_type);
@@ -2189,12 +2134,11 @@
     vp9_tokenize_mb(cpi, xd, t, !output_enabled);
 
   } else {
-    int mb_skip_context =
-      cpi->common.mb_no_coeff_skip ?
-      (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-      (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
-      0;
-    if (cpi->common.mb_no_coeff_skip) {
+    // FIXME(rbultje): not tile-aware (mi - 1)
+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
+
+    if (cm->mb_no_coeff_skip) {
       mbmi->mb_skip_coeff = 1;
       if (output_enabled)
         cpi->skip_true_count[mb_skip_context]++;
@@ -2249,12 +2193,8 @@
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   unsigned char ref_pred_flag;
-  int n;
-  TOKENEXTRA *tp[4];
-  int skip[4];
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   unsigned int segment_id = mi->mbmi.segment_id;
-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
   const int mis = cm->mode_info_stride;
 
   if (cm->frame_type == KEY_FRAME) {
@@ -2341,118 +2281,101 @@
                                        mb_row, mb_col);
   }
 
-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
-    if (!x->skip) {
-      vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
-                           dst, dst_y_stride);
-      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
-                            usrc, vsrc, src_uv_stride,
-                            udst, vdst, dst_uv_stride);
-      vp9_transform_sby_32x32(x);
-      vp9_transform_sbuv_16x16(x);
-      vp9_quantize_sby_32x32(x);
-      vp9_quantize_sbuv_16x16(x);
-      // TODO(rbultje): trellis optimize
-      vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
-      vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
-      vp9_recon_sby_s_c(&x->e_mbd, dst);
-      vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);
-
-      vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
-    } else {
-      int mb_skip_context =
-          cpi->common.mb_no_coeff_skip ?
-          (mi - 1)->mbmi.mb_skip_coeff +
-          (mi - mis)->mbmi.mb_skip_coeff :
-          0;
-      mi->mbmi.mb_skip_coeff = 1;
-      if (cm->mb_no_coeff_skip) {
-        if (output_enabled)
-          cpi->skip_true_count[mb_skip_context]++;
-        vp9_fix_contexts_sb(xd);
-      } else {
-        vp9_stuff_sb(cpi, xd, t, !output_enabled);
-        if (output_enabled)
-          cpi->skip_false_count[mb_skip_context]++;
-      }
+  if (!x->skip) {
+    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
+                         dst, dst_y_stride);
+    vp9_subtract_sbuv_s_c(x->src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    switch (mi->mbmi.txfm_size) {
+      case TX_32X32:
+        vp9_transform_sby_32x32(x);
+        vp9_transform_sbuv_16x16(x);
+        vp9_quantize_sby_32x32(x);
+        vp9_quantize_sbuv_16x16(x);
+        if (x->optimize) {
+          vp9_optimize_sby_32x32(x);
+          vp9_optimize_sbuv_16x16(x);
+        }
+        vp9_inverse_transform_sby_32x32(xd);
+        vp9_inverse_transform_sbuv_16x16(xd);
+        break;
+      case TX_16X16:
+        vp9_transform_sby_16x16(x);
+        vp9_transform_sbuv_16x16(x);
+        vp9_quantize_sby_16x16(x);
+        vp9_quantize_sbuv_16x16(x);
+        if (x->optimize) {
+          vp9_optimize_sby_16x16(x);
+          vp9_optimize_sbuv_16x16(x);
+        }
+        vp9_inverse_transform_sby_16x16(xd);
+        vp9_inverse_transform_sbuv_16x16(xd);
+        break;
+      case TX_8X8:
+        vp9_transform_sby_8x8(x);
+        vp9_transform_sbuv_8x8(x);
+        vp9_quantize_sby_8x8(x);
+        vp9_quantize_sbuv_8x8(x);
+        if (x->optimize) {
+          vp9_optimize_sby_8x8(x);
+          vp9_optimize_sbuv_8x8(x);
+        }
+        vp9_inverse_transform_sby_8x8(xd);
+        vp9_inverse_transform_sbuv_8x8(xd);
+        break;
+      case TX_4X4:
+        vp9_transform_sby_4x4(x);
+        vp9_transform_sbuv_4x4(x);
+        vp9_quantize_sby_4x4(x);
+        vp9_quantize_sbuv_4x4(x);
+        if (x->optimize) {
+          vp9_optimize_sby_4x4(x);
+          vp9_optimize_sbuv_4x4(x);
+        }
+        vp9_inverse_transform_sby_4x4(xd);
+        vp9_inverse_transform_sbuv_4x4(xd);
+        break;
+      default: assert(0);
     }
+    vp9_recon_sby_s_c(xd, dst);
+    vp9_recon_sbuv_s_c(xd, udst, vdst);
 
-    // copy skip flag on all mb_mode_info contexts in this SB
-    // if this was a skip at this txfm size
-    if (mb_col < cm->mb_cols - 1)
-      mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-    if (mb_row < cm->mb_rows - 1) {
-      mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-      if (mb_col < cm->mb_cols - 1)
-        mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-    }
-    skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff;
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled);
   } else {
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
+    // FIXME(rbultje): not tile-aware (mi - 1)
+    int mb_skip_context = cm->mb_no_coeff_skip ?
+          (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
 
-      xd->left_context = cm->left_context + y_idx + (mb_row & 2);
-      xd->above_context = cm->above_context + mb_col + x_idx;
-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-      tp[n] = *t;
-      xd->mode_info_context = mi + x_idx + y_idx * mis;
-
-      if (!x->skip) {
-        vp9_subtract_mby_s_c(x->src_diff,
-                             src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                             src_y_stride,
-                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                             dst_y_stride);
-        vp9_subtract_mbuv_s_c(x->src_diff,
-                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                              src_uv_stride,
-                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                              dst_uv_stride);
-        vp9_fidct_mb(x);
-        vp9_recon_mby_s_c(&x->e_mbd,
-                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-        vp9_recon_mbuv_s_c(&x->e_mbd,
-                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-      } else {
-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-            (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-            (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff :
-            0;
-        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
-        if (cpi->common.mb_no_coeff_skip) {
-          // TODO(rbultje) this should be done per-sb instead of per-mb?
-          if (output_enabled)
-            cpi->skip_true_count[mb_skip_context]++;
-          vp9_reset_mb_tokens_context(xd);
-        } else {
-          vp9_stuff_mb(cpi, xd, t, !output_enabled);
-          // TODO(rbultje) this should be done per-sb instead of per-mb?
-          if (output_enabled)
-            cpi->skip_false_count[mb_skip_context]++;
-        }
-      }
+    mi->mbmi.mb_skip_coeff = 1;
+    if (cm->mb_no_coeff_skip) {
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
+      vp9_reset_sb_tokens_context(xd);
+    } else {
+      vp9_stuff_sb(cpi, xd, t, !output_enabled);
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
     }
+  }
 
-    xd->mode_info_context = mi;
-    update_sb_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);
+  // copy skip flag on all mb_mode_info contexts in this SB
+  // if this was a skip at this txfm size
+  if (mb_col < cm->mb_cols - 1)
+    mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+  if (mb_row < cm->mb_rows - 1) {
+    mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    if (mb_col < cm->mb_cols - 1)
+      mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
   }
 
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
           (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
     } else {
-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
-                      TX_32X32 :
-                      cm->txfm_mode;
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
       mi->mbmi.txfm_size = sz;
       if (mb_col < cm->mb_cols - 1)
         mi[1].mbmi.txfm_size = sz;
@@ -2480,11 +2403,8 @@
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   unsigned char ref_pred_flag;
   int n;
-  TOKENEXTRA *tp[16];
-  int skip[16];
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   unsigned int segment_id = mi->mbmi.segment_id;
-  ENTROPY_CONTEXT_PLANES ta[16], tl[16];
   const int mis = cm->mode_info_stride;
 
   if (cm->frame_type == KEY_FRAME) {
@@ -2570,149 +2490,99 @@
                                        mb_row, mb_col);
   }
 
-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
-    int n;
+  if (!x->skip) {
+    vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
+    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
+                            udst, vdst, dst_uv_stride);
 
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-
-      xd->mode_info_context = mi + x_idx * 2 + mis * y_idx * 2;
-      xd->left_context = cm->left_context + (y_idx << 1);
-      xd->above_context = cm->above_context + mb_col + (x_idx << 1);
-      memcpy(&ta[n * 2], xd->above_context, sizeof(*ta) * 2);
-      memcpy(&tl[n * 2], xd->left_context, sizeof(*tl) * 2);
-      tp[n] = *t;
-      xd->mode_info_context = mi + x_idx * 2 + y_idx * mis * 2;
-      if (!x->skip) {
-        vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
-                             src + x_idx * 32 + y_idx * 32 * src_y_stride,
-                             src_y_stride,
-                             dst + x_idx * 32 + y_idx * 32 * dst_y_stride,
-                             dst_y_stride);
-        vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
-                              usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                              vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                              src_uv_stride,
-                              udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                              vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                              dst_uv_stride);
-        vp9_transform_sby_32x32(x);
-        vp9_transform_sbuv_16x16(x);
-        vp9_quantize_sby_32x32(x);
-        vp9_quantize_sbuv_16x16(x);
-        // TODO(rbultje): trellis optimize
-        vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
-        vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
-        vp9_recon_sby_s_c(&x->e_mbd,
-                          dst + 32 * x_idx + 32 * y_idx * dst_y_stride);
-        vp9_recon_sbuv_s_c(&x->e_mbd,
-                           udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                           vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride);
-
-        vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
-      } else {
-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-                              (mi - 1)->mbmi.mb_skip_coeff +
-                                  (mi - mis)->mbmi.mb_skip_coeff : 0;
-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        if (cm->mb_no_coeff_skip) {
-          if (output_enabled)
-            cpi->skip_true_count[mb_skip_context]++;
-          vp9_fix_contexts_sb(xd);
-        } else {
-          vp9_stuff_sb(cpi, xd, t, !output_enabled);
-          if (output_enabled)
-            cpi->skip_false_count[mb_skip_context]++;
+    switch (xd->mode_info_context->mbmi.txfm_size) {
+      case TX_32X32:
+        vp9_transform_sb64y_32x32(x);
+        vp9_transform_sb64uv_32x32(x);
+        vp9_quantize_sb64y_32x32(x);
+        vp9_quantize_sb64uv_32x32(x);
+        if (x->optimize) {
+          vp9_optimize_sb64y_32x32(x);
+          vp9_optimize_sb64uv_32x32(x);
         }
-      }
-
-      // copy skip flag on all mb_mode_info contexts in this SB
-      // if this was a skip at this txfm size
-      if (mb_col + x_idx * 2 < cm->mb_cols - 1)
-        mi[mis * y_idx * 2 + x_idx * 2 + 1].mbmi.mb_skip_coeff =
-            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
-      if (mb_row + y_idx * 2 < cm->mb_rows - 1) {
-        mi[mis * y_idx * 2 + x_idx * 2 + mis].mbmi.mb_skip_coeff =
-            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
-        if (mb_col + x_idx * 2 < cm->mb_cols - 1)
-          mi[mis * y_idx * 2 + x_idx * 2 + mis + 1].mbmi.mb_skip_coeff =
-              mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
-      }
-      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+        vp9_inverse_transform_sb64y_32x32(xd);
+        vp9_inverse_transform_sb64uv_32x32(xd);
+        break;
+      case TX_16X16:
+        vp9_transform_sb64y_16x16(x);
+        vp9_transform_sb64uv_16x16(x);
+        vp9_quantize_sb64y_16x16(x);
+        vp9_quantize_sb64uv_16x16(x);
+        if (x->optimize) {
+          vp9_optimize_sb64y_16x16(x);
+          vp9_optimize_sb64uv_16x16(x);
+        }
+        vp9_inverse_transform_sb64y_16x16(xd);
+        vp9_inverse_transform_sb64uv_16x16(xd);
+        break;
+      case TX_8X8:
+        vp9_transform_sb64y_8x8(x);
+        vp9_transform_sb64uv_8x8(x);
+        vp9_quantize_sb64y_8x8(x);
+        vp9_quantize_sb64uv_8x8(x);
+        if (x->optimize) {
+          vp9_optimize_sb64y_8x8(x);
+          vp9_optimize_sb64uv_8x8(x);
+        }
+        vp9_inverse_transform_sb64y_8x8(xd);
+        vp9_inverse_transform_sb64uv_8x8(xd);
+        break;
+      case TX_4X4:
+        vp9_transform_sb64y_4x4(x);
+        vp9_transform_sb64uv_4x4(x);
+        vp9_quantize_sb64y_4x4(x);
+        vp9_quantize_sb64uv_4x4(x);
+        if (x->optimize) {
+          vp9_optimize_sb64y_4x4(x);
+          vp9_optimize_sb64uv_4x4(x);
+        }
+        vp9_inverse_transform_sb64y_4x4(xd);
+        vp9_inverse_transform_sb64uv_4x4(xd);
+        break;
+      default: assert(0);
     }
+    vp9_recon_sb64y_s_c(xd, dst);
+    vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);
+
+    vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);
   } else {
-    for (n = 0; n < 16; n++) {
-      const int x_idx = n & 3, y_idx = n >> 2;
+    // FIXME(rbultje): not tile-aware (mi - 1)
+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
 
-      xd->left_context = cm->left_context + y_idx;
-      xd->above_context = cm->above_context + mb_col + x_idx;
-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-      tp[n] = *t;
-      xd->mode_info_context = mi + x_idx + y_idx * mis;
-
-      if (!x->skip) {
-        vp9_subtract_mby_s_c(x->src_diff,
-                             src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                             src_y_stride,
-                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                             dst_y_stride);
-        vp9_subtract_mbuv_s_c(x->src_diff,
-                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                              src_uv_stride,
-                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                              dst_uv_stride);
-        vp9_fidct_mb(x);
-        vp9_recon_mby_s_c(&x->e_mbd,
-                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-        vp9_recon_mbuv_s_c(&x->e_mbd,
-                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-      } else {
-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-          (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0;
-        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
-        if (cpi->common.mb_no_coeff_skip) {
-          // TODO(rbultje) this should be done per-sb instead of per-mb?
-          if (output_enabled)
-            cpi->skip_true_count[mb_skip_context]++;
-          vp9_reset_mb_tokens_context(xd);
-        } else {
-          vp9_stuff_mb(cpi, xd, t, !output_enabled);
-          // TODO(rbultje) this should be done per-sb instead of per-mb?
-          if (output_enabled)
-            cpi->skip_false_count[mb_skip_context]++;
-        }
-      }
+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+    if (cm->mb_no_coeff_skip) {
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
+      vp9_reset_sb64_tokens_context(xd);
+    } else {
+      vp9_stuff_sb64(cpi, xd, t, !output_enabled);
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
     }
   }
 
-  xd->mode_info_context = mi;
-  update_sb64_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);
+  // copy skip flag on all mb_mode_info contexts in this SB
+  // if this was a skip at this txfm size
+  for (n = 1; n < 16; n++) {
+    const int x_idx = n & 3, y_idx = n >> 2;
+    if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows)
+      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+  }
 
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip &&
-           ((mi->mbmi.txfm_size == TX_32X32 &&
-             skip[0] && skip[1] && skip[2] && skip[3]) ||
-            (mi->mbmi.txfm_size != TX_32X32 &&
-             skip[0] && skip[1] && skip[2] && skip[3] &&
-             skip[4] && skip[5] && skip[6] && skip[7] &&
-             skip[8] && skip[9] && skip[10] && skip[11] &&
-             skip[12] && skip[13] && skip[14] && skip[15]))) ||
+        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
           (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
     } else {
       int x, y;
-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
-                    TX_32X32 :
-                    cm->txfm_mode;
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
       for (y = 0; y < 4; y++) {
         for (x = 0; x < 4; x++) {
           if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -58,7 +58,8 @@
   } else {
     x->fwd_txm4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(x, ib);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);
+    vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
+                                b->dqcoeff, b->diff, 32);
   }
 
   vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
@@ -174,13 +175,16 @@
       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
         x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
-        vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
-        vp9_inverse_transform_b_4x4(xd, ib + iblock[i] + 1, 32);
+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
+                                    b->dqcoeff, b->diff, 32);
+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1],
+                                    (b + 1)->dqcoeff, (b + 1)->diff, 32);
         i++;
       } else {
         x->fwd_txm4x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4(x, ib + iblock[i]);
-        vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
+                                    b->dqcoeff, b->diff, 32);
       }
     }
   }
@@ -210,7 +214,8 @@
 
   x->fwd_txm4x4(be->src_diff, be->coeff, 16);
   x->quantize_b_4x4(x, ib);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16);
+  vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
+                              b->dqcoeff, b->diff, 16);
 
   vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
                    b->dst_stride);
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -146,6 +146,50 @@
   }
 }
 
+void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                            const uint8_t *pred, int dst_stride) {
+  int r, c;
+  // Stores the 64x64 residual (src - pred) into diff, one row per iteration.
+  for (r = 0; r < 64; r++) {
+    for (c = 0; c < 64; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 64;          // diff rows are packed, 64 entries each
+    pred += dst_stride;  // prediction rows are dst_stride bytes apart
+    src  += src_stride;  // source rows are src_stride bytes apart
+  }
+}
+
+void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
+                             const uint8_t *vsrc, int src_stride,
+                             const uint8_t *upred,
+                             const uint8_t *vpred, int dst_stride) {
+  int16_t *udiff = diff + 4096;         // U output starts after 64*64 entries
+  int16_t *vdiff = diff + 4096 + 1024;  // V output starts 32*32 entries later
+  int r, c;
+  // Both planes are 32x32; each residual is source minus prediction.
+  for (r = 0; r < 32; r++) {
+    for (c = 0; c < 32; c++) {
+      udiff[c] = usrc[c] - upred[c];
+    }
+
+    udiff += 32;          // packed output rows of 32 entries
+    upred += dst_stride;  // U and V predictions share dst_stride
+    usrc  += src_stride;  // U and V sources share src_stride
+  }
+
+  for (r = 0; r < 32; r++) {  // identical pass for the V plane
+    for (c = 0; c < 32; c++) {
+      vdiff[c] = vsrc[c] - vpred[c];
+    }
+
+    vdiff += 32;
+    vpred += dst_stride;
+    vsrc  += src_stride;
+  }
+}
+
 void vp9_subtract_mby_c(int16_t *diff, uint8_t *src,
                         uint8_t *pred, int stride) {
   vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
@@ -245,17 +289,170 @@
 }
 
 void vp9_transform_sby_32x32(MACROBLOCK *x) {
-  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
-  vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64);
+  vp9_short_fdct32x32(x->src_diff, x->coeff, 64);  // one 32x32 transform
 }
 
+void vp9_transform_sby_16x16(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+  for (n = 0; n < 4; n++) {  // four 16x16 blocks, two per row
+    const int x_idx = n & 1, y_idx = n >> 1;  // block column / block row
+    // input block starts at row y_idx * 16, column x_idx * 16; rows 32 apart
+    x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
+                    x->coeff + n * 256, 64);  // 256 coeffs per block
+  }
+}
+
+void vp9_transform_sby_8x8(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+  for (n = 0; n < 16; n++) {  // sixteen 8x8 blocks, four per row
+    const int x_idx = n & 3, y_idx = n >> 2;  // block column / block row
+    // input block starts at row y_idx * 8, column x_idx * 8; rows 32 apart
+    x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
+                  x->coeff + n * 64, 64);  // 64 coeffs per block
+  }
+}
+
+void vp9_transform_sby_4x4(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+  for (n = 0; n < 64; n++) {  // sixty-four 4x4 blocks, eight per row
+    const int x_idx = n & 7, y_idx = n >> 3;  // block column / block row
+    // input block starts at row y_idx * 4, column x_idx * 4; rows 32 apart
+    x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
+                  x->coeff + n * 16, 64);  // 16 coeffs per block
+  }
+}
+
 void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
-  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
   vp9_clear_system_state();
-  x->fwd_txm16x16(x_sb->src_diff + 1024, x_sb->coeff + 1024, 32);
-  x->fwd_txm16x16(x_sb->src_diff + 1280, x_sb->coeff + 1280, 32);
+  x->fwd_txm16x16(x->src_diff + 1024, x->coeff + 1024, 32);  // offset 32*32
+  x->fwd_txm16x16(x->src_diff + 1280, x->coeff + 1280, 32);  // + 16*16 more
 }
 
+void vp9_transform_sbuv_8x8(MACROBLOCK *x) {
+  int n;
+  // Splits the two 16x16 planes at offsets 1024 and 1280 into 8x8 transforms.
+  vp9_clear_system_state();
+  for (n = 0; n < 4; n++) {  // four 8x8 blocks per plane, two per row
+    const int x_idx = n & 1, y_idx = n >> 1;  // block column / block row
+    // NOTE(review): last arg 32 = 2 * row width 16; presumably a byte pitch
+    x->fwd_txm8x8(x->src_diff + 1024 + y_idx * 16 * 8 + x_idx * 8,
+                  x->coeff + 1024 + n * 64, 32);  // plane at 1024 (U? confirm)
+    x->fwd_txm8x8(x->src_diff + 1280 + y_idx * 16 * 8 + x_idx * 8,
+                  x->coeff + 1280 + n * 64, 32);  // plane at 1280 (V? confirm)
+  }
+}
+
+void vp9_transform_sbuv_4x4(MACROBLOCK *x) {
+  int n;
+  // Splits the two 16x16 planes at offsets 1024 and 1280 into 4x4 transforms.
+  vp9_clear_system_state();
+  for (n = 0; n < 16; n++) {  // sixteen 4x4 blocks per plane, four per row
+    const int x_idx = n & 3, y_idx = n >> 2;  // block column / block row
+    // NOTE(review): last arg 32 = 2 * row width 16; presumably a byte pitch
+    x->fwd_txm4x4(x->src_diff + 1024 + y_idx * 16 * 4 + x_idx * 4,
+                  x->coeff + 1024 + n * 16, 32);  // plane at 1024 (U? confirm)
+    x->fwd_txm4x4(x->src_diff + 1280 + y_idx * 16 * 4 + x_idx * 4,
+                  x->coeff + 1280 + n * 16, 32);  // plane at 1280 (V? confirm)
+  }
+}
+
+void vp9_transform_sb64y_32x32(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 128 = 2 * row width 64; presumably a byte pitch
+  for (n = 0; n < 4; n++) {  // four 32x32 blocks, two per row
+    const int x_idx = n & 1, y_idx = n >> 1;  // block column / block row
+    // input block starts at row y_idx * 32, column x_idx * 32; rows 64 apart
+    vp9_short_fdct32x32(x->src_diff + y_idx * 64 * 32 + x_idx * 32,
+                        x->coeff + n * 1024, 128);  // 1024 coeffs per block
+  }
+}
+
+void vp9_transform_sb64y_16x16(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 128 = 2 * row width 64; presumably a byte pitch
+  for (n = 0; n < 16; n++) {  // sixteen 16x16 blocks, four per row
+    const int x_idx = n & 3, y_idx = n >> 2;  // block column / block row
+    // input block starts at row y_idx * 16, column x_idx * 16; rows 64 apart
+    x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
+                    x->coeff + n * 256, 128);  // 256 coeffs per block
+  }
+}
+
+void vp9_transform_sb64y_8x8(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 128 = 2 * row width 64; presumably a byte pitch
+  for (n = 0; n < 64; n++) {  // sixty-four 8x8 blocks, eight per row
+    const int x_idx = n & 7, y_idx = n >> 3;  // block column / block row
+    // input block starts at row y_idx * 8, column x_idx * 8; rows 64 apart
+    x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
+                  x->coeff + n * 64, 128);  // 64 coeffs per block
+  }
+}
+
+void vp9_transform_sb64y_4x4(MACROBLOCK *x) {
+  int n;
+  // NOTE(review): last arg 128 = 2 * row width 64; presumably a byte pitch
+  for (n = 0; n < 256; n++) {  // 256 4x4 blocks, sixteen per row
+    const int x_idx = n & 15, y_idx = n >> 4;  // block column / block row
+    // input block starts at row y_idx * 4, column x_idx * 4; rows 64 apart
+    x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
+                  x->coeff + n * 16, 128);  // 16 coeffs per block
+  }
+}
+
+void vp9_transform_sb64uv_32x32(MACROBLOCK *x) {
+  vp9_clear_system_state();
+  vp9_short_fdct32x32(x->src_diff + 4096,  // plane stored after 64*64 entries
+                      x->coeff + 4096, 64);
+  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,  // next plane, 32*32 later
+                      x->coeff + 4096 + 1024, 64);
+}
+
+void vp9_transform_sb64uv_16x16(MACROBLOCK *x) {
+  int n;
+  // Splits the two 32x32 planes at offsets 4096 and 5120 into 16x16 blocks.
+  vp9_clear_system_state();
+  for (n = 0; n < 4; n++) {  // four 16x16 blocks per plane, two per row
+    const int x_idx = n & 1, y_idx = n >> 1;  // block column / block row
+    // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+    x->fwd_txm16x16(x->src_diff + 4096 + y_idx * 32 * 16 + x_idx * 16,
+                    x->coeff + 4096 + n * 256, 64);  // first plane
+    x->fwd_txm16x16(x->src_diff + 4096 + 1024 + y_idx * 32 * 16 + x_idx * 16,
+                    x->coeff + 4096 + 1024 + n * 256, 64);  // second plane
+  }
+}
+
+void vp9_transform_sb64uv_8x8(MACROBLOCK *x) {
+  int n;
+  // Splits the two 32x32 planes at offsets 4096 and 5120 into 8x8 blocks.
+  vp9_clear_system_state();
+  for (n = 0; n < 16; n++) {  // sixteen 8x8 blocks per plane, four per row
+    const int x_idx = n & 3, y_idx = n >> 2;  // block column / block row
+    // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+    x->fwd_txm8x8(x->src_diff + 4096 + y_idx * 32 * 8 + x_idx * 8,
+                  x->coeff + 4096 + n * 64, 64);  // first plane
+    x->fwd_txm8x8(x->src_diff + 4096 + 1024 + y_idx * 32 * 8 + x_idx * 8,
+                  x->coeff + 4096 + 1024 + n * 64, 64);  // second plane
+  }
+}
+
+void vp9_transform_sb64uv_4x4(MACROBLOCK *x) {
+  int n;
+  // Splits the two 32x32 planes at offsets 4096 and 5120 into 4x4 blocks.
+  vp9_clear_system_state();
+  for (n = 0; n < 64; n++) {  // sixty-four 4x4 blocks per plane, eight per row
+    const int x_idx = n & 7, y_idx = n >> 3;  // block column / block row
+    // NOTE(review): last arg 64 = 2 * row width 32; presumably a byte pitch
+    x->fwd_txm4x4(x->src_diff + 4096 + y_idx * 32 * 4 + x_idx * 4,
+                  x->coeff + 4096 + n * 16, 64);  // first plane
+    x->fwd_txm4x4(x->src_diff + 4096 + 1024 + y_idx * 32 * 4 + x_idx * 4,
+                  x->coeff + 4096 + 1024 + n * 16, 64);  // second plane
+  }
+}
+
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -294,21 +491,20 @@
   return vp9_get_coef_context(&recent_energy, token);
 }
 
-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
+static void optimize_b(MACROBLOCK *mb, int ib, PLANE_TYPE type,
+                       const int16_t *dequant_ptr,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
   const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *b = &mb->block[i];
-  BLOCKD *d = &xd->block[i];
-  vp9_token_state tokens[257][2];
-  unsigned best_index[257][2];
-  const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  int16_t *qcoeff_ptr = d->qcoeff;
-  int16_t *dqcoeff_ptr = d->dqcoeff;
-  int eob = xd->eobs[i], final_eob, sz = 0;
+  vp9_token_state tokens[1025][2];
+  unsigned best_index[1025][2];
+  const int16_t *coeff_ptr = mb->coeff + ib * 16;
+  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
+  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;
+  int eob = xd->eobs[ib], final_eob, sz = 0;
   const int i0 = 0;
-  int rc, x, next;
+  int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
@@ -315,34 +511,15 @@
   int err_mult = plane_rd_mult[type];
   int default_eob;
   int const *scan;
+  const int mul = 1 + (tx_size == TX_32X32);
 
   switch (tx_size) {
     default:
     case TX_4X4:
-      scan = vp9_default_zig_zag1d_4x4;
       default_eob = 16;
-      // TODO: this isn't called (for intra4x4 modes), but will be left in
-      // since it could be used later
-      {
-        TX_TYPE tx_type = get_tx_type_4x4(&mb->e_mbd, d);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan_4x4;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan_4x4;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d_4x4;
-              break;
-          }
-        } else {
-          scan = vp9_default_zig_zag1d_4x4;
-        }
-      }
+      // FIXME(rbultje): although optimize_b currently isn't called for
+      // intra4x4, this should be changed to be adst-compatible
+      scan = vp9_default_zig_zag1d_4x4;
       break;
     case TX_8X8:
       scan = vp9_default_zig_zag1d_8x8;
@@ -352,6 +529,10 @@
       scan = vp9_default_zig_zag1d_16x16;
       default_eob = 256;
       break;
+    case TX_32X32:
+      scan = vp9_default_zig_zag1d_32x32;
+      default_eob = 1024;
+      break;
   }
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -395,7 +576,7 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -407,8 +588,9 @@
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
+          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
+                                         dequant_ptr[rc != 0]))
         shortcut = 1;
       else
         shortcut = 0;
@@ -504,7 +686,7 @@
       final_eob = i;
     rc = scan[i];
     qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
 
     next = tokens[i][best].next;
     best = best_index[i][best];
@@ -511,7 +693,7 @@
   }
   final_eob++;
 
-  xd->eobs[d - xd->block] = final_eob;
+  xd->eobs[ib] = final_eob;
   *a = *l = (final_eob > 0);
 }
 
@@ -531,7 +713,7 @@
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 0; b < 16; b++) {
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC,
+    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
@@ -553,7 +735,7 @@
   tl = (ENTROPY_CONTEXT *)&t_left;
 
   for (b = 16; b < 24; b++) {
-    optimize_b(x, b, PLANE_TYPE_UV,
+    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
                ta + vp9_block2above[TX_4X4][b],
                tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
@@ -583,7 +765,8 @@
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, &above_ec, &left_ec, TX_8X8);
+    optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
+               &above_ec, &left_ec, TX_8X8);
     a[1] = a[0] = above_ec;
     l[1] = l[0] = left_ec;
   }
@@ -602,7 +785,8 @@
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8);
+    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
+               &above_ec, &left_ec, TX_8X8);
   }
 }
 
@@ -621,12 +805,340 @@
 
   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16);
+  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+             &ta, &tl, TX_16X16);
 }
 
 static void optimize_mb_16x16(MACROBLOCK *x) {
   vp9_optimize_mby_16x16(x);
   vp9_optimize_mbuv_8x8(x);
+}
+
+void vp9_optimize_sby_32x32(MACROBLOCK *x) {
+  ENTROPY_CONTEXT ta, tl;
+  int above_sum = 0, left_sum = 0, i;
+  for (i = 0; i < 8; i++) {  /* 4 entries from each of 2 MB contexts */
+    above_sum += ((ENTROPY_CONTEXT *)(x->e_mbd.above_context + i / 4))[i % 4];
+    left_sum += ((ENTROPY_CONTEXT *)(x->e_mbd.left_context + i / 4))[i % 4];
+  }
+  ta = above_sum != 0;
+  tl = left_sum != 0;
+  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
+             &ta, &tl, TX_32X32);
+}
+
+void vp9_optimize_sby_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[2], tl[2];
+  int mb, blk;
+
+  /* One merged context per 16x16 block column/row: nonzero test on the
+   * sum of the first four entries of the matching MB context struct. */
+  for (mb = 0; mb < 2; mb++) {
+    const ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *)(xd->above_context + mb);
+    const ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *)(xd->left_context + mb);
+
+    ta[mb] = (a[0] + a[1] + a[2] + a[3]) != 0;
+    tl[mb] = (l[0] + l[1] + l[2] + l[3]) != 0;
+  }
+  /* 2x2 raster of 16x16 blocks; the block index advances 16 per block. */
+  for (blk = 0; blk < 4; blk++)
+    optimize_b(x, blk * 16, PLANE_TYPE_Y_WITH_DC, xd->block[0].dequant,
+               ta + (blk & 1), tl + (blk >> 1), TX_16X16);
+}
+
+void vp9_optimize_sby_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[4], tl[4];
+  int i, blk;
+
+  /* One merged context per 8x8 column (ta) and row (tl): entry i comes
+   * from MB context struct i / 2 and tests the sum of two adjacent
+   * entries, [0] + [1] for even i and [2] + [3] for odd i. */
+  for (i = 0; i < 4; i++) {
+    const ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *)(xd->above_context + i / 2);
+    const ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *)(xd->left_context + i / 2);
+    const int first = (i & 1) * 2;
+
+    ta[i] = (a[first] + a[first + 1]) != 0;
+    tl[i] = (l[first] + l[first + 1]) != 0;
+  }
+
+  /* 4x4 raster of 8x8 blocks; the block index advances 4 per block. */
+  for (blk = 0; blk < 16; blk++) {
+    optimize_b(x, blk * 4, PLANE_TYPE_Y_WITH_DC, xd->block[0].dequant,
+               ta + (blk & 3), tl + (blk >> 2), TX_8X8);
+  }
+}
+
+void vp9_optimize_sby_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[8], tl[8];
+  int mb, row, col;
+
+  /* Work on copies: 4 entries from each of the 2 above/left MB contexts. */
+  for (mb = 0; mb < 2; mb++) {
+    vpx_memcpy(ta + 4 * mb, xd->above_context + mb, 4 * sizeof(*ta));
+    vpx_memcpy(tl + 4 * mb, xd->left_context + mb, 4 * sizeof(*tl));
+  }
+  for (row = 0; row < 8; row++)
+    for (col = 0; col < 8; col++)
+      optimize_b(x, row * 8 + col, PLANE_TYPE_Y_WITH_DC,
+                 xd->block[0].dequant, ta + col, tl + row, TX_4X4);
+}
+
+void vp9_optimize_sbuv_16x16(MACROBLOCK *x) {
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
+  /* Distance, in entries, from one MB context struct to the next. */
+  const int mb_step = sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+  int b;
+
+  /* Two iterations: b = 64 (block[16].dequant) and b = 80 (block[20]). */
+  for (b = 64; b < 96; b += 16) {
+    const ENTROPY_CONTEXT *const a = ta + vp9_block2above_sb[TX_16X16][b];
+    const ENTROPY_CONTEXT *const l = tl + vp9_block2left_sb[TX_16X16][b];
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1] + a[mb_step] + a[mb_step + 1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1] + l[mb_step] + l[mb_step + 1]) != 0;
+
+    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b < 80 ? 16 : 20].dequant,
+               &above_ec, &left_ec, TX_16X16);
+  }
+}
+
+void vp9_optimize_sbuv_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy[2], left_copy[2];
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)left_copy;
+  int b;
+
+  /* Contexts are updated between blocks, but only in these local copies. */
+  vpx_memcpy(above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(left_copy, xd->left_context, sizeof(left_copy));
+  for (b = 64; b < 96; b += 4) {
+    ENTROPY_CONTEXT *const a = ta + vp9_block2above_sb[TX_8X8][b];
+    ENTROPY_CONTEXT *const l = tl + vp9_block2left_sb[TX_8X8][b];
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+    optimize_b(x, b, PLANE_TYPE_UV, xd->block[b < 80 ? 16 : 20].dequant,
+               &above_ec, &left_ec, TX_8X8);
+    a[1] = a[0] = above_ec;  /* both entries covered by the 8x8 block */
+    l[1] = l[0] = left_ec;
+  }
+}
+
+void vp9_optimize_sbuv_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy[2], left_copy[2];
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)left_copy;
+  int b;
+
+  /* optimize_b writes its result into entries of these local copies. */
+  vpx_memcpy(above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(left_copy, xd->left_context, sizeof(left_copy));
+
+  for (b = 64; b < 96; b++) {
+    optimize_b(x, b, PLANE_TYPE_UV, xd->block[b < 80 ? 16 : 20].dequant,
+               ta + vp9_block2above_sb[TX_4X4][b],
+               tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);
+  }
+}
+
+void vp9_optimize_sb64y_32x32(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[2], tl[2];
+  int half, blk;
+
+  /* Each 32x32 block spans two MB contexts; its merged context is the
+   * nonzero test on the sum of the first four entries of both. */
+  for (half = 0; half < 2; half++) {
+    int above_sum = 0, left_sum = 0, i;
+
+    for (i = 0; i < 8; i++) {
+      const int mb = 2 * half + i / 4;
+      above_sum += ((ENTROPY_CONTEXT *)(xd->above_context + mb))[i % 4];
+      left_sum += ((ENTROPY_CONTEXT *)(xd->left_context + mb))[i % 4];
+    }
+    ta[half] = above_sum != 0;
+    tl[half] = left_sum != 0;
+  }
+  for (blk = 0; blk < 4; blk++) {
+    optimize_b(x, blk * 64, PLANE_TYPE_Y_WITH_DC, xd->block[0].dequant,
+               ta + (blk & 1), tl + (blk >> 1), TX_32X32);
+  }
+}
+
+void vp9_optimize_sb64y_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[4], tl[4];
+  int mb, blk;
+
+  /*
+   * Build one merged context per 16x16 column (ta) and row (tl): the
+   * nonzero test on the sum of the first four entries of the matching
+   * MB context struct.
+   */
+  for (mb = 0; mb < 4; mb++) {
+    const ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *)(xd->above_context + mb);
+    const ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *)(xd->left_context + mb);
+
+    ta[mb] = (a[0] + a[1] + a[2] + a[3]) != 0;
+    tl[mb] = (l[0] + l[1] + l[2] + l[3]) != 0;
+  }
+
+  /* 4x4 raster of 16x16 blocks; the block index advances 16 per block. */
+  for (blk = 0; blk < 16; blk++) {
+    const int col = blk & 3;
+    const int row = blk >> 2;
+
+    optimize_b(x, blk * 16, PLANE_TYPE_Y_WITH_DC, xd->block[0].dequant,
+               ta + col, tl + row, TX_16X16);
+  }
+}
+
+void vp9_optimize_sb64y_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[8], tl[8];
+  int i, blk;
+
+  /*
+   * Build one merged context per 8x8 column (ta) and row (tl).
+   *
+   * Entry i is taken from MB context struct i / 2 and is the nonzero test
+   * on the sum of two adjacent entries: [0] + [1] for even i, [2] + [3]
+   * for odd i.
+   */
+  for (i = 0; i < 8; i++) {
+    const ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *)(xd->above_context + i / 2);
+    const ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *)(xd->left_context + i / 2);
+    const int first = (i & 1) * 2;
+
+    ta[i] = (a[first] + a[first + 1]) != 0;
+    tl[i] = (l[first] + l[first + 1]) != 0;
+  }
+
+  /*
+   * Visit the 8x8 grid of 8x8 blocks in raster order.  The block index
+   * passed to optimize_b counts 4x4 units, so it advances by 4 per block.
+   * optimize_b stores the new context through both pointers, so later
+   * blocks in the same column/row see the updated value.
+   */
+  for (blk = 0; blk < 64; blk++) {
+    const int col = blk & 7;
+    const int row = blk >> 3;
+
+    optimize_b(x, blk * 4, PLANE_TYPE_Y_WITH_DC, xd->block[0].dequant,
+               ta + col, tl + row, TX_8X8);
+  }
+}
+
+void vp9_optimize_sb64y_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT ta[16], tl[16];
+  int mb, blk;
+
+  /* Gather the first four entries of each of the four above/left MB
+   * contexts into flat local arrays; optimize_b updates only these. */
+  for (mb = 0; mb < 4; mb++) {
+    vpx_memcpy(ta + 4 * mb, xd->above_context + mb, 4 * sizeof(*ta));
+    vpx_memcpy(tl + 4 * mb, xd->left_context + mb, 4 * sizeof(*tl));
+  }
+
+  /* 16x16 raster of 4x4 blocks: column is blk & 15, row is blk >> 4. */
+  for (blk = 0; blk < 256; blk++) {
+    optimize_b(x, blk, PLANE_TYPE_Y_WITH_DC,
+               xd->block[0].dequant,
+               ta + (blk & 15), tl + (blk >> 4), TX_4X4);
+  }
+}
+
+void vp9_optimize_sb64uv_32x32(MACROBLOCK *x) {
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
+  const int mb_step = sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+  int b;
+
+  /* Two iterations: b = 256 (block[16].dequant) and b = 320 (block[20]). */
+  for (b = 256; b < 384; b += 64) {
+    const ENTROPY_CONTEXT *const a = ta + vp9_block2above_sb64[TX_32X32][b];
+    const ENTROPY_CONTEXT *const l = tl + vp9_block2left_sb64[TX_32X32][b];
+    ENTROPY_CONTEXT a_ec, l_ec;
+    int above_sum = 0, left_sum = 0, mb;
+    for (mb = 0; mb < 4; mb++) {  /* two entries per spanned MB context */
+      above_sum += a[mb * mb_step] + a[mb * mb_step + 1];
+      left_sum += l[mb * mb_step] + l[mb * mb_step + 1];
+    }
+    a_ec = above_sum != 0;
+    l_ec = left_sum != 0;
+    optimize_b(x, b, PLANE_TYPE_UV, x->e_mbd.block[b < 320 ? 16 : 20].dequant,
+               &a_ec, &l_ec, TX_32X32);
+  }
+}
+
+void vp9_optimize_sb64uv_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mb_step = sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT_PLANES above_copy[4], left_copy[4];
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)left_copy;
+  int b;
+
+  /* Contexts are updated between blocks, but only in these local copies. */
+  vpx_memcpy(above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(left_copy, xd->left_context, sizeof(left_copy));
+  for (b = 256; b < 384; b += 16) {
+    ENTROPY_CONTEXT *const a = ta + vp9_block2above_sb64[TX_16X16][b];
+    ENTROPY_CONTEXT *const l = tl + vp9_block2left_sb64[TX_16X16][b];
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1] + a[mb_step] + a[mb_step + 1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1] + l[mb_step] + l[mb_step + 1]) != 0;
+
+    optimize_b(x, b, PLANE_TYPE_UV, xd->block[b < 320 ? 16 : 20].dequant,
+               &above_ec, &left_ec, TX_16X16);
+    a[0] = a[1] = a[mb_step] = a[mb_step + 1] = above_ec;
+    l[0] = l[1] = l[mb_step] = l[mb_step + 1] = left_ec;
+  }
+}
+
+void vp9_optimize_sb64uv_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy[4], left_copy[4];
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)left_copy;
+  int b;
+
+  /* Contexts are updated between blocks, but only in these local copies. */
+  vpx_memcpy(above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(left_copy, xd->left_context, sizeof(left_copy));
+  for (b = 256; b < 384; b += 4) {
+    ENTROPY_CONTEXT *const a = ta + vp9_block2above_sb64[TX_8X8][b];
+    ENTROPY_CONTEXT *const l = tl + vp9_block2left_sb64[TX_8X8][b];
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+    optimize_b(x, b, PLANE_TYPE_UV, xd->block[b < 320 ? 16 : 20].dequant,
+               &above_ec, &left_ec, TX_8X8);
+    a[1] = a[0] = above_ec;  /* both entries covered by the 8x8 block */
+    l[1] = l[0] = left_ec;
+  }
+}
+
+void vp9_optimize_sb64uv_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy[4], left_copy[4];
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)left_copy;
+  int b;
+
+  /* optimize_b writes its result into entries of these local copies. */
+  vpx_memcpy(above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(left_copy, xd->left_context, sizeof(left_copy));
+
+  for (b = 256; b < 384; b++) {
+    optimize_b(x, b, PLANE_TYPE_UV, xd->block[b < 320 ? 16 : 20].dequant,
+               ta + vp9_block2above_sb64[TX_4X4][b],
+               tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);
+  }
 }
 
 void vp9_fidct_mb(MACROBLOCK *x) {
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -35,7 +35,6 @@
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
 void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_build_dcblock_8x8(MACROBLOCK *b);
 void vp9_optimize_mby_8x8(MACROBLOCK *x);
 void vp9_optimize_mbuv_8x8(MACROBLOCK *x);
 
@@ -44,8 +43,37 @@
 void vp9_optimize_mby_16x16(MACROBLOCK *x);
 
 void vp9_transform_sby_32x32(MACROBLOCK *x);
+void vp9_optimize_sby_32x32(MACROBLOCK *x);
+void vp9_transform_sby_16x16(MACROBLOCK *x);
+void vp9_optimize_sby_16x16(MACROBLOCK *x);
+void vp9_transform_sby_8x8(MACROBLOCK *x);
+void vp9_optimize_sby_8x8(MACROBLOCK *x);
+void vp9_transform_sby_4x4(MACROBLOCK *x);
+void vp9_optimize_sby_4x4(MACROBLOCK *x);
 void vp9_transform_sbuv_16x16(MACROBLOCK *x);
+void vp9_optimize_sbuv_16x16(MACROBLOCK *x);
+void vp9_transform_sbuv_8x8(MACROBLOCK *x);
+void vp9_optimize_sbuv_8x8(MACROBLOCK *x);
+void vp9_transform_sbuv_4x4(MACROBLOCK *x);
+void vp9_optimize_sbuv_4x4(MACROBLOCK *x);
 
+void vp9_transform_sb64y_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64y_32x32(MACROBLOCK *x);
+void vp9_transform_sb64y_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64y_16x16(MACROBLOCK *x);
+void vp9_transform_sb64y_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64y_8x8(MACROBLOCK *x);
+void vp9_transform_sb64y_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64y_4x4(MACROBLOCK *x);
+void vp9_transform_sb64uv_32x32(MACROBLOCK *x);
+void vp9_optimize_sb64uv_32x32(MACROBLOCK *x);
+void vp9_transform_sb64uv_16x16(MACROBLOCK *x);
+void vp9_optimize_sb64uv_16x16(MACROBLOCK *x);
+void vp9_transform_sb64uv_8x8(MACROBLOCK *x);
+void vp9_optimize_sb64uv_8x8(MACROBLOCK *x);
+void vp9_transform_sb64uv_4x4(MACROBLOCK *x);
+void vp9_optimize_sb64uv_4x4(MACROBLOCK *x);
+
 void vp9_fidct_mb(MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
@@ -63,5 +91,11 @@
                            const uint8_t *vsrc, int src_stride,
                            const uint8_t *upred,
                            const uint8_t *vpred, int dst_stride);
+void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                            const uint8_t *pred, int dst_stride);
+void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
+                             const uint8_t *vsrc, int src_stride,
+                             const uint8_t *upred,
+                             const uint8_t *vpred, int dst_stride);
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -92,7 +92,7 @@
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
   vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
 
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
@@ -476,9 +476,9 @@
   vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
   vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
 
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32];
-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
+  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
 
   int gfu_boost;
   int last_boost;
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -344,17 +344,17 @@
 }
 
 void vp9_quantize_sby_32x32(MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &xd->block[0];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const b = &x->block[0];
+  BLOCKD *const d = &xd->block[0];
 
   quantize(b->zrun_zbin_boost,
-           x->sb_coeff_data.coeff,
+           x->coeff,  /* 1024 below: presumably the coefficient count */
            1024, b->skip_block,
            b->zbin,
            b->round, b->quant, b->quant_shift,
-           xd->sb_coeff_data.qcoeff,
-           xd->sb_coeff_data.dqcoeff,
+           xd->qcoeff,
+           xd->dqcoeff,
            d->dequant,
            b->zbin_extra,
            &xd->eobs[0],
@@ -361,22 +361,284 @@
            vp9_default_zig_zag1d_32x32, 2);
 }
 
+void vp9_quantize_sby_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Four 16x16 luma transforms of 256 coefficients each, all quantized
+   * with block 0's settings.  eobs[] is indexed in 4x4 units, so each
+   * transform's end-of-block lands 16 entries after the previous one. */
+  for (n = 0; n < 4; n++) {
+    const int off = n * 256;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 256, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n * 16],
+             vp9_default_zig_zag1d_16x16, 1);
+  }
+}
+
+void vp9_quantize_sby_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Sixteen 8x8 luma transforms of 64 coefficients each, all quantized
+   * with block 0's settings.  eobs[] is indexed in 4x4 units, so each
+   * transform's end-of-block lands 4 entries after the previous one. */
+  for (n = 0; n < 16; n++) {
+    const int off = n * 64;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 64, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n * 4],
+             vp9_default_zig_zag1d_8x8, 1);
+  }
+}
+
+void vp9_quantize_sby_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Sixty-four 4x4 luma transforms of 16 coefficients each, all
+   * quantized with block 0's settings; one eobs[] entry per transform,
+   * stored consecutively starting at index 0. */
+  for (n = 0; n < 64; n++) {
+    const int off = n * 16;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 16, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n],
+             vp9_default_zig_zag1d_4x4, 1);
+  }
+}
+
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
 
-  for (i = 16; i < 24; i += 4)
-    quantize(x->block[i].zrun_zbin_boost,
-             x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,
-             256, x->block[i].skip_block,
-             x->block[i].zbin,
-             x->block[i].round, x->block[0].quant, x->block[i].quant_shift,
-             xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
-             xd->sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
-             xd->block[i].dequant,
-             x->block[i].zbin_extra,
+  for (i = 64; i < 96; i += 16) {  /* i counts 4x4 units (16 coeffs each) */
+    int cidx = i < 80 ? 16 : 20;  /* quantizer source: block 16 or 20 */
+    quantize(x->block[cidx].zrun_zbin_boost,
+             x->coeff + i * 16,
+             256, x->block[cidx].skip_block,
+             x->block[cidx].zbin, x->block[cidx].round,
+             x->block[cidx].quant, x->block[cidx].quant_shift,
+             xd->qcoeff + i * 16,
+             xd->dqcoeff + i * 16,
+             xd->block[cidx].dequant,
+             x->block[cidx].zbin_extra,
              &xd->eobs[i],
              vp9_default_zig_zag1d_16x16, 1);
+  }
+}
+
+void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 64..79 are quantized with
+   * block 16's settings, 80..95 with block 20's; 4 units per transform. */
+  for (ib = 64; ib < 96; ib += 4) {
+    const int cidx = ib >= 80 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 64, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_8x8, 1);
+  }
+}
+
+void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 64..79 are quantized with
+   * block 16's settings, 80..95 with block 20's; 1 unit per transform. */
+  for (ib = 64; ib < 96; ib++) {
+    const int cidx = ib >= 80 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 16, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_4x4, 1);
+  }
+}
+
+void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Four 32x32 luma transforms of 1024 coefficients each, all quantized
+   * with block 0's settings.  eobs[] is indexed in 4x4 units, so each
+   * transform's end-of-block lands 64 entries after the previous one. */
+  for (n = 0; n < 4; n++) {
+    const int off = n * 1024;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 1024, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n * 64],
+             vp9_default_zig_zag1d_32x32, 2);
+  }
+}
+
+void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Sixteen 16x16 luma transforms of 256 coefficients each, all quantized
+   * with block 0's settings.  eobs[] is indexed in 4x4 units, so each
+   * transform's end-of-block lands 16 entries after the previous one. */
+  for (n = 0; n < 16; n++) {
+    const int off = n * 256;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 256, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n * 16],
+             vp9_default_zig_zag1d_16x16, 1);
+  }
+}
+
+void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* Sixty-four 8x8 luma transforms of 64 coefficients each, all quantized
+   * with block 0's settings.  eobs[] is indexed in 4x4 units, so each
+   * transform's end-of-block lands 4 entries after the previous one. */
+  for (n = 0; n < 64; n++) {
+    const int off = n * 64;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 64, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n * 4],
+             vp9_default_zig_zag1d_8x8, 1);
+  }
+}
+
+void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const blk = &x->block[0];
+  int n;
+
+  /* 256 4x4 luma transforms of 16 coefficients each, all quantized with
+   * block 0's settings; one eobs[] entry per transform, stored
+   * consecutively starting at index 0. */
+  for (n = 0; n < 256; n++) {
+    const int off = n * 16;
+
+    quantize(blk->zrun_zbin_boost, x->coeff + off, 16, blk->skip_block,
+             blk->zbin, blk->round, blk->quant, blk->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[0].dequant, blk->zbin_extra,
+             &xd->eobs[n],
+             vp9_default_zig_zag1d_4x4, 1);
+  }
+}
+
+void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 256 is quantized with
+   * block 16's settings, 320 with block 20's; 64 units per transform. */
+  for (ib = 256; ib < 384; ib += 64) {
+    const int cidx = ib >= 320 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 1024, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_32x32, 2);
+  }
+}
+
+void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 256..319 are quantized
+   * with block 16's settings, 320..383 with block 20's; 16 per transform. */
+  for (ib = 256; ib < 384; ib += 16) {
+    const int cidx = ib >= 320 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 256, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_16x16, 1);
+  }
+}
+
+void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 256..319 are quantized
+   * with block 16's settings, 320..383 with block 20's; 4 per transform. */
+  for (ib = 256; ib < 384; ib += 4) {
+    const int cidx = ib >= 320 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 64, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_8x8, 1);
+  }
+}
+
+void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int ib;
+
+  /* ib counts 4x4 units (16 coefficients each): 256..319 are quantized
+   * with block 16's settings, 320..383 with block 20's; 1 per transform. */
+  for (ib = 256; ib < 384; ib++) {
+    const int cidx = ib >= 320 ? 20 : 16;
+    BLOCK *const q = &x->block[cidx];
+    const int off = ib * 16;
+
+    quantize(q->zrun_zbin_boost, x->coeff + off, 16, q->skip_block,
+             q->zbin, q->round, q->quant, q->quant_shift,
+             xd->qcoeff + off, xd->dqcoeff + off,
+             xd->block[cidx].dequant, q->zbin_extra,
+             &xd->eobs[ib],
+             vp9_default_zig_zag1d_4x4, 1);
+  }
 }
 
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -74,7 +74,21 @@
 extern prototype_quantize_mb(vp9_quantize_mby_16x16);
 
 void vp9_quantize_sby_32x32(MACROBLOCK *x);
+void vp9_quantize_sby_16x16(MACROBLOCK *x);
+void vp9_quantize_sby_8x8(MACROBLOCK *x);
+void vp9_quantize_sby_4x4(MACROBLOCK *x);
 void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
+void vp9_quantize_sbuv_8x8(MACROBLOCK *x);
+void vp9_quantize_sbuv_4x4(MACROBLOCK *x);
+
+void vp9_quantize_sb64y_32x32(MACROBLOCK *x);
+void vp9_quantize_sb64y_16x16(MACROBLOCK *x);
+void vp9_quantize_sb64y_8x8(MACROBLOCK *x);
+void vp9_quantize_sb64y_4x4(MACROBLOCK *x);
+void vp9_quantize_sb64uv_32x32(MACROBLOCK *x);
+void vp9_quantize_sb64uv_16x16(MACROBLOCK *x);
+void vp9_quantize_sb64uv_8x8(MACROBLOCK *x);
+void vp9_quantize_sb64uv_4x4(MACROBLOCK *x);
 
 struct VP9_COMP;
 
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -273,7 +273,7 @@
   fill_token_costs(cpi->mb.token_costs[TX_16X16],
                    cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
-                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32);
+                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
 
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
@@ -380,25 +380,27 @@
 }
 
 static INLINE int cost_coeffs(MACROBLOCK *mb,
-                              BLOCKD *b, PLANE_TYPE type,
+                              int ib, PLANE_TYPE type,
                               ENTROPY_CONTEXT *a,
                               ENTROPY_CONTEXT *l,
                               TX_SIZE tx_size) {
-  int pt;
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int ib = (int)(b - xd->block);
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+  int pt;
   const int eob = xd->eobs[ib];
   int c = 0;
   int cost = 0, seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int segment_id = mbmi->segment_id;
   const int *scan;
-  int16_t *qcoeff_ptr = b->qcoeff;
-  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
+  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
+  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 &&
+                           type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, &xd->block[ib]) : DCT_DCT;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  ENTROPY_CONTEXT a_ec, l_ec;
   ENTROPY_CONTEXT *const a1 = a +
       sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
   ENTROPY_CONTEXT *const l1 = l +
@@ -406,6 +408,8 @@
 
   switch (tx_size) {
     case TX_4X4:
+      a_ec = *a;
+      l_ec = *l;
       scan = vp9_default_zig_zag1d_4x4;
       seg_eob = 16;
       if (type == PLANE_TYPE_Y_WITH_DC) {
@@ -426,8 +430,6 @@
       scan = vp9_default_zig_zag1d_16x16;
       seg_eob = 256;
       if (type == PLANE_TYPE_UV) {
-        const int uv_idx = ib - 16;
-        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       } else {
@@ -438,11 +440,22 @@
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       seg_eob = 1024;
-      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
-      a_ec = (a[0] + a[1] + a[2] + a[3] +
-              a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-      l_ec = (l[0] + l[1] + l[2] + l[3] +
-              l1[0] + l1[1] + l1[2] + l1[3]) != 0;
+      if (type == PLANE_TYPE_UV) {
+        ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
+        a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3] +
+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3] +
+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
+      }
       break;
     default:
       abort();
@@ -508,7 +521,7 @@
   }
 
   for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -551,7 +564,7 @@
   }
 
   for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_WITH_DC,
+    cost += cost_coeffs(mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b],
                         TX_8X8);
@@ -591,7 +604,7 @@
     tl = (ENTROPY_CONTEXT *)xd->left_context;
   }
 
-  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
+  cost = cost_coeffs(mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
   return cost;
 }
 
@@ -741,7 +754,7 @@
     tl = (ENTROPY_CONTEXT *) xd->left_context;
   }
 
-  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+  return cost_coeffs(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
 }
 
 static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
@@ -761,9 +774,7 @@
 static void super_block_yrd_32x32(MACROBLOCK *x,
                                   int *rate, int *distortion, int *skippable,
                                   int backup) {
-  SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
+  MACROBLOCKD *const xd = &x->e_mbd;
 #if DEBUG_ERROR
   int16_t out[1024];
 #endif
@@ -771,17 +782,17 @@
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
 #if DEBUG_ERROR
-  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
+  vp9_short_idct32x32(xd->dqcoeff, out, 64);
 #endif
 
-  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024);
 
 #if DEBUG_ERROR
   printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
-         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
+         vp9_block_error_c(x->src_diff, out, 1024), *distortion);
 #endif
   *rate       = rdcost_sby_32x32(x, backup);
-  *skippable  = vp9_sby_is_skippable_32x32(&x->e_mbd);
+  *skippable  = vp9_sby_is_skippable_32x32(xd);
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
@@ -805,7 +816,7 @@
     s[n] = 1;
   }
 
-  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
                        dst, dst_y_stride);
   super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
 
@@ -894,7 +905,7 @@
 
     xd->above_context = &t_above[TX_32X32][x_idx << 1];
     xd->left_context = &t_left[TX_32X32][y_idx << 1];
-    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
+    vp9_subtract_sby_s_c(x->src_diff,
                          src + 32 * x_idx + 32 * y_idx * src_y_stride,
                          src_y_stride,
                          dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
@@ -1049,7 +1060,8 @@
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+    ratey = cost_coeffs(x, b - xd->block,
+                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
     rate += ratey;
     distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
 
@@ -1353,7 +1365,7 @@
       ta1 = ta0 + 1;
       tl1 = tl0 + 1;
 
-      rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
+      rate_t = cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
                            ta0, tl0, TX_8X8);
 
       rate += rate_t;
@@ -1386,12 +1398,12 @@
           x->quantize_b_4x4(x, ib + iblock[i]);
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
-        rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
+        rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                               i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
         if (do_two) {
           i++;
-          rate_t += cost_coeffs(x, b + 1, PLANE_TYPE_Y_WITH_DC,
+          rate_t += cost_coeffs(x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
                                 i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                                 TX_4X4);
         }
@@ -1498,7 +1510,7 @@
   }
 
   for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
@@ -1539,7 +1551,7 @@
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(mb, b, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_8X8);
 
@@ -1578,7 +1590,7 @@
   }
 
   for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(x, b * 4, PLANE_TYPE_UV,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b], TX_16X16);
 
@@ -1594,8 +1606,8 @@
   vp9_quantize_sbuv_16x16(x);
 
   *rate       = rd_cost_sbuv_16x16(x, backup);
-  *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
-                                   xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
+  *distortion = vp9_block_error_c(x->coeff + 1024,
+                                  xd->dqcoeff + 1024, 512) >> 2;
   *skip       = vp9_sbuv_is_skippable_16x16(xd);
 }
 
@@ -1607,8 +1619,8 @@
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+  if (mbmi->txfm_size >= TX_16X16) {
+    vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
     rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
@@ -1787,8 +1799,8 @@
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+  if (mbmi->txfm_size >= TX_16X16) {
+    vp9_subtract_sbuv_s_c(x->src_diff,
                           usrc, vsrc, src_uv_stride,
                           udst, vdst, dst_uv_stride);
     rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
@@ -1840,6 +1852,46 @@
   }
 }
 
+static int rd_cost_sb64uv_32x32(MACROBLOCK *x, int backup) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta, *tl;
+  /* Returns the summed cost_coeffs() of the two TX_32X32 UV blocks. */
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
+    /* With backup set, cost_coeffs() is handed the local copies. */
+    ta = (ENTROPY_CONTEXT *) &t_above;
+    tl = (ENTROPY_CONTEXT *) &t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
+  /* b = 16, 20 -> block indices 256, 320; context offsets use TX_8X8 rows. */
+  for (b = 16; b < 24; b += 4)
+    cost += cost_coeffs(x, b * 16, PLANE_TYPE_UV,
+                        ta + vp9_block2above[TX_8X8][b],
+                        tl + vp9_block2left[TX_8X8][b], TX_32X32);
+
+  return cost;
+}
+
+static void rd_inter64x64_uv_32x32(MACROBLOCK *x, int *rate,
+                                   int *distortion, int *skip,
+                                   int backup) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  /* Run the sb64uv 32x32 transform and quantize calls, then fill outputs. */
+  vp9_transform_sb64uv_32x32(x);
+  vp9_quantize_sb64uv_32x32(x);
+  /* The error is taken over 2048 entries starting at offset 4096. */
+  *rate       = rd_cost_sb64uv_32x32(x, backup);
+  *distortion = vp9_block_error_c(x->coeff + 4096,
+                                  xd->dqcoeff + 4096, 2048);
+  *skip       = vp9_sb64uv_is_skippable_32x32(xd);
+}
+
 static void super_block_64_uvrd(MACROBLOCK *x,
                                 int *rate,
                                 int *distortion,
@@ -1854,10 +1906,15 @@
   ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
   int d = 0, r = 0, n, s = 1;
 
+  // FIXME not needed if tx=32x32
   memcpy(t_above, xd->above_context, sizeof(t_above));
   memcpy(t_left,  xd->left_context,  sizeof(t_left));
 
   if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
+                            udst, vdst, dst_uv_stride);
+    rd_inter64x64_uv_32x32(x, &r, &d, &s, 1);
+  } else if (mbmi->txfm_size == TX_16X16) {
     int n;
 
     *rate = 0;
@@ -1865,7 +1922,7 @@
       int x_idx = n & 1, y_idx = n >> 1;
       int r_tmp, d_tmp, s_tmp;
 
-      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+      vp9_subtract_sbuv_s_c(x->src_diff,
                             usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
                             vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
                             src_uv_stride,
@@ -2168,7 +2225,7 @@
       x->quantize_b_4x4(x, i);
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(x, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + vp9_block2above[TX_4X4][i],
                                  tl + vp9_block2left[TX_4X4][i], TX_4X4);
     }
@@ -2231,10 +2288,10 @@
           x->quantize_b_8x8(x, idx);
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
-          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[TX_8X8][idx],
-                                     tlcp + vp9_block2left[TX_8X8][idx],
-                                     TX_8X8);
+          othercost += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
+                                   tacp + vp9_block2above[TX_8X8][idx],
+                                   tlcp + vp9_block2left[TX_8X8][idx],
+                                   TX_8X8);
         }
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
@@ -2243,11 +2300,12 @@
           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
-          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+          *labelyrate += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                            ta + vp9_block2above[TX_4X4][ib + iblock[j]],
                            tl + vp9_block2left[TX_4X4][ib + iblock[j]],
                            TX_4X4);
-          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+          *labelyrate += cost_coeffs(x, ib + iblock[j] + 1,
+                           PLANE_TYPE_Y_WITH_DC,
                            ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
                            tl + vp9_block2left[TX_4X4][ib + iblock[j]],
                            TX_4X4);
@@ -2261,11 +2319,12 @@
             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
-            othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+            othercost += cost_coeffs(x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
                            TX_4X4);
-            othercost += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+            othercost += cost_coeffs(x, ib + iblock[j] + 1,
+                           PLANE_TYPE_Y_WITH_DC,
                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
                            TX_4X4);
@@ -2275,7 +2334,7 @@
         x->quantize_b_8x8(x, idx);
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+        *labelyrate += cost_coeffs(x, idx, PLANE_TYPE_Y_WITH_DC,
                                    ta + vp9_block2above[TX_8X8][idx],
                                    tl + vp9_block2left[TX_8X8][idx], TX_8X8);
       }
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -28,12 +28,12 @@
 vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
 vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
 vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
+vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
 
 extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
 extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
 extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
+extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
@@ -101,37 +101,52 @@
                        PLANE_TYPE type,
                        TX_SIZE tx_size,
                        int dry_run) {
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
   int c = 0;
   int recent_energy = 0;
-  const BLOCKD * const b = xd->block + ib;
   const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  int16_t *qcoeff_ptr = b->qcoeff;
+  int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib;
   int seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int segment_id = mbmi->segment_id;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
   const int *scan;
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+  const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 &&
+                           type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, &xd->block[ib]) : DCT_DCT;
+  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
 
-  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
-      vp9_block2above[tx_size][ib];
-  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +
-      vp9_block2left[tx_size][ib];
-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  if (sb_type == BLOCK_SIZE_SB64X64) {
+    a = (ENTROPY_CONTEXT *)xd->above_context +
+                                             vp9_block2above_sb64[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+  } else if (sb_type == BLOCK_SIZE_SB32X32) {
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a2 = a3 = l2 = l3 = NULL;
+  } else {
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
+    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
+  }
 
-  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +
-      vp9_block2above[tx_size][ib];
-  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +
-      vp9_block2left[tx_size][ib];
-
-
   switch (tx_size) {
     default:
     case TX_4X4:
+      a_ec = *a;
+      l_ec = *l;
       seg_eob = 16;
       scan = vp9_default_zig_zag1d_4x4;
       if (tx_type != DCT_DCT) {
@@ -164,23 +179,23 @@
       scan = vp9_default_zig_zag1d_16x16;
       counts = cpi->coef_counts_16x16;
       probs = cpi->common.fc.coef_probs_16x16;
-      if (type == PLANE_TYPE_UV) {
-        int uv_idx = (ib - 16) >> 2;
-        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
-      }
       break;
     case TX_32X32:
-      a_ec = a[0] + a[1] + a[2] + a[3] +
-             a1[0] + a1[1] + a1[2] + a1[3];
-      l_ec = l[0] + l[1] + l[2] + l[3] +
-             l1[0] + l1[1] + l1[2] + l1[3];
-      a_ec = a_ec != 0;
-      l_ec = l_ec != 0;
+      if (type != PLANE_TYPE_UV) {
+        a_ec = (a[0] + a[1] + a[2] + a[3] +
+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3] +
+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
+      }
       seg_eob = 1024;
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
-      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
       break;
   }
 
@@ -233,10 +248,17 @@
       l1[0] = l1[1] = l[1] = l_ec;
     }
   } else if (tx_size == TX_32X32) {
-    a[1] = a[2] = a[3] = a_ec;
-    l[1] = l[2] = l[3] = l_ec;
-    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+    if (type != PLANE_TYPE_UV) {
+      a[1] = a[2] = a[3] = a_ec;
+      l[1] = l[2] = l[3] = l_ec;
+      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
+      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+    } else {
+      a[1] = a1[0] = a1[1] = a_ec;
+      l[1] = l1[0] = l1[1] = l_ec;
+      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
+      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
+    }
   }
 }
 
@@ -289,9 +311,7 @@
 }
 
 int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  skip &= !xd->eobs[0];
-  return skip;
+  return (!xd->eobs[0]);  /* 1 when eobs[0] is zero, otherwise 0 */
 }
 
 static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
@@ -299,13 +319,11 @@
 }
 
 int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
-  int skip = 1;
-  skip &= !xd->eobs[0];
-  return skip;
+  return (!xd->eobs[0]);  /* 1 when eobs[0] is zero, otherwise 0 */
 }
 
 int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->eobs[16]) & (!xd->eobs[20]);
+  return (!xd->eobs[64]) & (!xd->eobs[80]);  /* 1 only if both are zero */
 }
 
 static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
@@ -313,6 +331,68 @@
          vp9_sbuv_is_skippable_16x16(xd);
 }
 
+static int sby_is_skippable_16x16(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at indices 0, 16, 32 and 48. */
+  for (i = 0; i < 64; i += 16)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb_is_skippable_16x16(MACROBLOCKD *xd) {
+  return sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);
+}  /* 1 only if the sby and sbuv 16x16 checks both return 1 */
+
+static int sby_is_skippable_8x8(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 4th index in [0, 64). */
+  for (i = 0; i < 64; i += 4)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sbuv_is_skippable_8x8(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 4th index in [64, 96). */
+  for (i = 64; i < 96; i += 4)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb_is_skippable_8x8(MACROBLOCKD *xd) {
+  return sby_is_skippable_8x8(xd) & sbuv_is_skippable_8x8(xd);
+}  /* 1 only if the sby and sbuv 8x8 checks both return 1 */
+
+static int sby_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every index in [0, 64). */
+  for (i = 0; i < 64; i++)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sbuv_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every index in [64, 96). */
+  for (i = 64; i < 96; i++)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb_is_skippable_4x4(MACROBLOCKD *xd) {
+  return sby_is_skippable_4x4(xd) & sbuv_is_skippable_4x4(xd);
+}  /* 1 only if the sby and sbuv 4x4 checks both return 1 */
+
 void vp9_tokenize_sb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
@@ -325,7 +405,21 @@
   const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
   int b;
 
-  mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
+  switch (mbmi->txfm_size) {
+    case TX_32X32:
+      mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
+      break;
+    case TX_16X16:
+      mbmi->mb_skip_coeff = sb_is_skippable_16x16(xd);
+      break;
+    case TX_8X8:
+      mbmi->mb_skip_coeff = sb_is_skippable_8x8(xd);
+      break;
+    case TX_4X4:
+      mbmi->mb_skip_coeff = sb_is_skippable_4x4(xd);
+      break;
+    default: assert(0);
+  }
 
   if (mbmi->mb_skip_coeff) {
     if (!dry_run)
@@ -333,7 +427,7 @@
     if (!cm->mb_no_coeff_skip) {
       vp9_stuff_sb(cpi, xd, t, dry_run);
     } else {
-      vp9_fix_contexts_sb(xd);
+      vp9_reset_sb_tokens_context(xd);
     }
     if (dry_run)
       *t = t_backup;
@@ -343,13 +437,215 @@
   if (!dry_run)
     cpi->skip_false_count[mb_skip_context] += skip_inc;
 
-  tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,
-             TX_32X32, dry_run);
+  switch (mbmi->txfm_size) {
+    case TX_32X32:
+      tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,
+                 TX_32X32, dry_run);
+      for (b = 64; b < 96; b += 16)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_16X16, dry_run);
+      break;
+    case TX_16X16:
+      for (b = 0; b < 64; b += 16)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_16X16, dry_run);
+      for (b = 64; b < 96; b += 16)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_16X16, dry_run);
+      break;
+    case TX_8X8:
+      for (b = 0; b < 64; b += 4)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_8X8, dry_run);
+      for (b = 64; b < 96; b += 4)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_8X8, dry_run);
+      break;
+    case TX_4X4:
+      for (b = 0; b < 64; b++)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_4X4, dry_run);
+      for (b = 64; b < 96; b++)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_4X4, dry_run);
+      break;
+    default: assert(0);
+  }
 
-  for (b = 16; b < 24; b += 4) {
-    tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-               TX_16X16, dry_run);
+  if (dry_run)
+    *t = t_backup;
+}
+
+static int sb64y_is_skippable_32x32(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at indices 0, 64, 128 and 192. */
+  for (i = 0; i < 256; i += 64)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) {
+  return (!xd->eobs[256]) & (!xd->eobs[320]);  /* 1 only if both are zero */
+}
+
+static int sb64_is_skippable_32x32(MACROBLOCKD *xd) {
+  return sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);
+}  /* 1 only if the sb64y and sb64uv 32x32 checks both return 1 */
+
+static int sb64y_is_skippable_16x16(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 16th index in [0, 256). */
+  for (i = 0; i < 256; i += 16)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 16th index in [256, 384). */
+  for (i = 256; i < 384; i += 16)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64_is_skippable_16x16(MACROBLOCKD *xd) {
+  return sb64y_is_skippable_16x16(xd) & sb64uv_is_skippable_16x16(xd);
+}  /* 1 only if the sb64y and sb64uv 16x16 checks both return 1 */
+
+static int sb64y_is_skippable_8x8(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 4th index in [0, 256). */
+  for (i = 0; i < 256; i += 4)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every 4th index in [256, 384). */
+  for (i = 256; i < 384; i += 4)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64_is_skippable_8x8(MACROBLOCKD *xd) {
+  return sb64y_is_skippable_8x8(xd) & sb64uv_is_skippable_8x8(xd);
+}  /* 1 only if the sb64y and sb64uv 8x8 checks both return 1 */
+
+static int sb64y_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every index in [0, 256). */
+  for (i = 0; i < 256; i++)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i = 0;
+  /* Result stays 1 only if eobs[] is 0 at every index in [256, 384). */
+  for (i = 256; i < 384; i++)
+    skip &= (!xd->eobs[i]);
+
+  return skip;
+}
+
+static int sb64_is_skippable_4x4(MACROBLOCKD *xd) {
+  return sb64y_is_skippable_4x4(xd) & sb64uv_is_skippable_4x4(xd);
+}  /* 1 only if the sb64y and sb64uv 4x4 checks both return 1 */
+
+void vp9_tokenize_sb64(VP9_COMP *cpi,
+                       MACROBLOCKD *xd,
+                       TOKENEXTRA **t,
+                       int dry_run) {
+  VP9_COMMON * const cm = &cpi->common;
+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
+  const int segment_id = mbmi->segment_id;
+  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+  int b;
+  /* Set mb_skip_coeff from the eobs[] check matching the transform size. */
+  switch (mbmi->txfm_size) {
+    case TX_32X32:
+      mbmi->mb_skip_coeff = sb64_is_skippable_32x32(xd);
+      break;
+    case TX_16X16:
+      mbmi->mb_skip_coeff = sb64_is_skippable_16x16(xd);
+      break;
+    case TX_8X8:
+      mbmi->mb_skip_coeff = sb64_is_skippable_8x8(xd);
+      break;
+    case TX_4X4:
+      mbmi->mb_skip_coeff = sb64_is_skippable_4x4(xd);
+      break;
+    default: assert(0);
   }
+  /* Skippable: count it, stuff tokens or reset contexts, then return. */
+  if (mbmi->mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cm->mb_no_coeff_skip) {
+      vp9_stuff_sb64(cpi, xd, t, dry_run);
+    } else {
+      vp9_reset_sb64_tokens_context(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+  /* Not skippable: the false counter is only updated outside dry runs. */
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+  /* Y uses block indices 0..255, UV 256..383; the step depends on tx size. */
+  switch (mbmi->txfm_size) {
+    case TX_32X32:
+      for (b = 0; b < 256; b += 64)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_32X32, dry_run);
+      for (b = 256; b < 384; b += 64)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_32X32, dry_run);
+      break;
+    case TX_16X16:
+      for (b = 0; b < 256; b += 16)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_16X16, dry_run);
+      for (b = 256; b < 384; b += 16)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_16X16, dry_run);
+      break;
+    case TX_8X8:
+      for (b = 0; b < 256; b += 4)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_8X8, dry_run);
+      for (b = 256; b < 384; b += 4)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_8X8, dry_run);
+      break;
+    case TX_4X4:
+      for (b = 0; b < 256; b++)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
+                   TX_4X4, dry_run);
+      for (b = 256; b < 384; b++)
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+                   TX_4X4, dry_run);
+      break;
+    default: assert(0);
+  }
+  /* A dry run puts the caller's token pointer back to its entry value. */
   if (dry_run)
     *t = t_backup;
 }
@@ -567,23 +863,23 @@
 
   /* print counts */
   print_counter(f, context_counters_4x4, BLOCK_TYPES,
-                "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]");
+                "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
   print_counter(f, context_counters_8x8, BLOCK_TYPES,
-                "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]");
+                "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
   print_counter(f, context_counters_16x16, BLOCK_TYPES,
-                "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]");
-  print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32,
-                "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]");
+                "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
+  print_counter(f, context_counters_32x32, BLOCK_TYPES,
+                "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
 
   /* print coefficient probabilities */
   print_probs(f, context_counters_4x4, BLOCK_TYPES,
-              "default_coef_probs_4x4[BLOCK_TYPES_4X4]");
+              "default_coef_probs_4x4[BLOCK_TYPES]");
   print_probs(f, context_counters_8x8, BLOCK_TYPES,
-              "default_coef_probs_8x8[BLOCK_TYPES_8X8]");
+              "default_coef_probs_8x8[BLOCK_TYPES]");
   print_probs(f, context_counters_16x16, BLOCK_TYPES,
-              "default_coef_probs_16x16[BLOCK_TYPES_16X16]");
-  print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32,
-              "default_coef_probs_32x32[BLOCK_TYPES_32X32]");
+              "default_coef_probs_16x16[BLOCK_TYPES]");
+  print_probs(f, context_counters_32x32, BLOCK_TYPES,
+              "default_coef_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
 
@@ -600,31 +896,49 @@
   fill_value_tokens();
 }
 
-static INLINE void stuff_b(VP9_COMP *cpi,
-                           MACROBLOCKD *xd,
-                           const int ib,
-                           TOKENEXTRA **tp,
-                           PLANE_TYPE type,
-                           TX_SIZE tx_size,
-                           int dry_run) {
+static void stuff_b(VP9_COMP *cpi,
+                    MACROBLOCKD *xd,
+                    const int ib,
+                    TOKENEXTRA **tp,
+                    PLANE_TYPE type,
+                    TX_SIZE tx_size,
+                    int dry_run) {
   vp9_coeff_count *counts;
   vp9_coeff_probs *probs;
   int pt, band;
   TOKENEXTRA *t = *tp;
-  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
-  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
-      vp9_block2above[tx_size][ib];
-  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +
-      vp9_block2left[tx_size][ib];
-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
-  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +
-      vp9_block2above[tx_size][ib];
-  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +
-      vp9_block2left[tx_size][ib];
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
 
+  if (sb_type == BLOCK_SIZE_SB64X64) {  /* uses the _sb64 lookup tables */
+    a = (ENTROPY_CONTEXT *)xd->above_context +
+                                             vp9_block2above_sb64[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+  } else if (sb_type == BLOCK_SIZE_SB32X32) {  /* uses the _sb tables */
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+    a2 = l2 = a3 = l3 = NULL;  /* only a/a1 and l/l1 are set for 32x32 */
+  } else {  /* any other sb_type: vp9_block2above / vp9_block2left */
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
+    a1 = l1 = a2 = l2 = a3 = l3 = NULL;  /* no extra context pointers */
+  }
+
   switch (tx_size) {
     default:
     case TX_4X4:
+      a_ec = a[0];
+      l_ec = l[0];
       counts = cpi->coef_counts_4x4;
       probs = cpi->common.fc.coef_probs_4x4;
       break;
@@ -646,12 +960,17 @@
       probs = cpi->common.fc.coef_probs_16x16;
       break;
     case TX_32X32:
-      a_ec = a[0] + a[1] + a[2] + a[3] +
-             a1[0] + a1[1] + a1[2] + a1[3];
-      l_ec = l[0] + l[1] + l[2] + l[3] +
-             l1[0] + l1[1] + l1[2] + l1[3];
-      a_ec = a_ec != 0;
-      l_ec = l_ec != 0;
+      if (type != PLANE_TYPE_UV) {
+        a_ec = (a[0] + a[1] + a[2] + a[3] +
+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3] +
+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
+      }
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
       break;
@@ -678,10 +997,17 @@
       l1[0] = l1[1] = l[1] = l_ec;
     }
   } else if (tx_size == TX_32X32) {
-    a[1] = a[2] = a[3] = a_ec;
-    l[1] = l[2] = l[3] = l_ec;
-    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+    if (type != PLANE_TYPE_UV) {  /* same split as the TX_32X32 a_ec read */
+      a[1] = a[2] = a[3] = a_ec;
+      l[1] = l[2] = l[3] = l_ec;
+      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
+      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+    } else {  /* UV: two entries in each of a..a3 and l..l3 */
+      a[1] = a1[0] = a1[1] = a_ec;
+      l[1] = l1[0] = l1[1] = l_ec;
+      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
+      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
+    }
   }
 
   if (!dry_run) {
@@ -751,27 +1077,76 @@
   }
 }
 
-static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
+void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TOKENEXTRA * const t_backup = *t;  /* entry value, restored on dry run */
   int b;
 
-  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_32X32:  /* one 32x32 Y call at b = 0; UV goes through 16x16 */
+      stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
+      for (b = 64; b < 96; b += 16)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
+      break;
+    case TX_16X16:  /* Y: b = 0..63, UV: b = 64..95, step 16 */
+      for (b = 0; b < 64; b += 16)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
+      for (b = 64; b < 96; b += 16)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
+      break;
+    case TX_8X8:  /* same index ranges, step 4 */
+      for (b = 0; b < 64; b += 4)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
+      for (b = 64; b < 96; b += 4)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
+      break;
+    case TX_4X4:  /* same index ranges, every index */
+      for (b = 0; b < 64; b++)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
+      for (b = 64; b < 96; b++)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
+      break;
+    default: assert(0);  /* no other txfm_size is handled here */
   }
+
+  if (dry_run) {  /* dry run: hand *t back unchanged to the caller */
+    *t = t_backup;
+  }
 }
 
-void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+void vp9_stuff_sb64(VP9_COMP *cpi, MACROBLOCKD *xd,
+                    TOKENEXTRA **t, int dry_run) {  /* Y calls, then UV */
   TOKENEXTRA * const t_backup = *t;
+  int b;
 
-  stuff_sb_32x32(cpi, xd, t, dry_run);
+  switch (xd->mode_info_context->mbmi.txfm_size) {
+    case TX_32X32:  /* Y: b = 0..255, UV: b = 256..383, step 64 */
+      for (b = 0; b < 256; b += 64)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
+      for (b = 256; b < 384; b += 64)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_32X32, dry_run);
+      break;
+    case TX_16X16:  /* same index ranges, step 16 */
+      for (b = 0; b < 256; b += 16)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
+      for (b = 256; b < 384; b += 16)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
+      break;
+    case TX_8X8:  /* same index ranges, step 4 */
+      for (b = 0; b < 256; b += 4)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
+      for (b = 256; b < 384; b += 4)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
+      break;
+    case TX_4X4:  /* same index ranges, every index */
+      for (b = 0; b < 256; b++)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
+      for (b = 256; b < 384; b++)
+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
+      break;
+    default: assert(0);  /* no other txfm_size is handled here */
+  }
 
   if (dry_run) {
     *t = t_backup;
   }
-}
-
-void vp9_fix_contexts_sb(MACROBLOCKD *xd) {
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 }
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -38,6 +38,7 @@
 int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
 int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
 int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
+int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd);
 
 struct VP9_COMP;
 
@@ -45,14 +46,16 @@
                      TOKENEXTRA **t, int dry_run);
 void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                      TOKENEXTRA **t, int dry_run);
+void vp9_tokenize_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                       TOKENEXTRA **t, int dry_run);
 
 void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                   TOKENEXTRA **t, int dry_run);
 void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                   TOKENEXTRA **t, int dry_run);
+void vp9_stuff_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                    TOKENEXTRA **t, int dry_run);
 
-void vp9_fix_contexts_sb(MACROBLOCKD *xd);
-
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
@@ -60,7 +63,7 @@
 extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
 extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
 extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
+extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
 #endif
 
 extern const int *vp9_dct_value_cost_ptr;
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -2442,6 +2442,7 @@
     int y[2], u[2], v[2];
     find_mismatch(&stream->ref_enc.img, &stream->ref_dec.img,
                   y, u, v);
+    stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d"
                           " at Y[%d, %d], U[%d, %d], V[%d, %d]",