shithub: libvpx

--- a/vpx_dsp/inv_txfm.c

+++ b/vpx_dsp/inv_txfm.c

@@ -93,6 +93,42 @@

+void iadst4_c(const tran_low_t *input, tran_low_t *output) {  // 4-point 1-D transform (inverse ADST, going by the name): input[0..3] -> output[0..3].

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

+  tran_low_t x0 = input[0];

+  tran_low_t x1 = input[1];

+  tran_low_t x2 = input[2];

+  tran_low_t x3 = input[3];

+  if (!(x0 | x1 | x2 | x3)) {  // All four inputs are zero: zero the output and skip the arithmetic.

+    memset(output, 0, 4 * sizeof(*output));

+    return;

+  }

+  s0 = sinpi_1_9 * x0;  // s0..s6: individual input-times-sinpi products, combined below.

+  s1 = sinpi_2_9 * x0;

+  s2 = sinpi_3_9 * x1;

+  s3 = sinpi_4_9 * x2;

+  s4 = sinpi_1_9 * x2;

+  s5 = sinpi_2_9 * x3;

+  s6 = sinpi_4_9 * x3;

+  s7 = WRAPLOW(x0 - x2 + x3);

+  s0 = s0 + s3 + s5;

+  s1 = s1 - s4 - s6;

+  s3 = s2;  // The old s3 was already folded into s0; s3 now carries the x1 product.

+  s2 = sinpi_3_9 * s7;

+  // 1-D transform scaling factor is sqrt(2).

+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

+  // + 1b (addition) = 29b.

+  // Hence the output bit depth is 15b.

+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));  // Every output is rounded by dct_const_round_shift, then passed through WRAPLOW.

+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));

+  output[2] = WRAPLOW(dct_const_round_shift(s2));

+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));

+}

 void idct4_c(const tran_low_t *input, tran_low_t *output) {

   tran_low_t step[4];

   tran_high_t temp1, temp2;

@@ -155,6 +191,81 @@

+void iadst8_c(const tran_low_t *input, tran_low_t *output) {  // 8-point 1-D transform (inverse ADST, going by the name): input[0..7] -> output[0..7].

+  int s0, s1, s2, s3, s4, s5, s6, s7;  // NOTE(review): int here, tran_high_t in vpx_highbd_iadst8_c; confirm the (int) narrowing casts below are intended.

+  tran_high_t x0 = input[7];  // Inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[5];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[3];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[1];

+  tran_high_t x7 = input[6];

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // All eight inputs are zero: zero the output and skip the arithmetic.

+    memset(output, 0, 8 * sizeof(*output));

+    return;

+  }

+  // stage 1: combine each pair (x0,x1) ... (x6,x7) with two cospi constants, then add/subtract the halves with rounding.

+  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);

+  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);

+  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);

+  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);

+  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);

+  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);

+  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);

+  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));

+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));

+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));

+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));

+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));

+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));

+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));

+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

+  // stage 2: x0..x3 pass through unscaled; x4..x7 are combined with cospi_8_64 / cospi_24_64.

+  s0 = (int)x0;

+  s1 = (int)x1;

+  s2 = (int)x2;

+  s3 = (int)x3;

+  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);

+  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);

+  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);

+  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

+  x0 = WRAPLOW(s0 + s2);  // No round shift on x0..x3: s0..s3 were not multiplied in this stage.

+  x1 = WRAPLOW(s1 + s3);

+  x2 = WRAPLOW(s0 - s2);

+  x3 = WRAPLOW(s1 - s3);

+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));

+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));

+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));

+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

+  // stage 3: scale the sum and difference of (x2,x3) and of (x6,x7) by cospi_16_64; x0, x1, x4, x5 are left as they are.

+  s2 = (int)(cospi_16_64 * (x2 + x3));

+  s3 = (int)(cospi_16_64 * (x2 - x3));

+  s6 = (int)(cospi_16_64 * (x6 + x7));

+  s7 = (int)(cospi_16_64 * (x6 - x7));

+  x2 = WRAPLOW(dct_const_round_shift(s2));

+  x3 = WRAPLOW(dct_const_round_shift(s3));

+  x6 = WRAPLOW(dct_const_round_shift(s6));

+  x7 = WRAPLOW(dct_const_round_shift(s7));

+  output[0] = WRAPLOW(x0);  // Outputs are a permutation of x0..x7; the odd-indexed outputs are negated.

+  output[1] = WRAPLOW(-x4);

+  output[2] = WRAPLOW(x6);

+  output[3] = WRAPLOW(-x2);

+  output[4] = WRAPLOW(x3);

+  output[5] = WRAPLOW(-x7);

+  output[6] = WRAPLOW(x5);

+  output[7] = WRAPLOW(-x1);

+}

 void idct8_c(const tran_low_t *input, tran_low_t *output) {

   tran_low_t step1[8], step2[8];

   tran_high_t temp1, temp2;

@@ -234,6 +345,31 @@

+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {  // 8x8 two-pass transform (idct8_c on rows, then columns); the result is added into dest, whose rows are stride elements apart.

+  int i, j;

+  tran_low_t out[8 * 8] = { 0 };  // Row-pass results; rows that are never written stay zero.

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[8], temp_out[8];

+  // First transform rows

+  // Only the first 4 rows have non-zero coefs, so only those rows are transformed.

+  for (i = 0; i < 4; ++i) {

+    idct8_c(input, outptr);

+    input += 8;

+    outptr += 8;

+  }

+  // Then transform columns

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // Gather column i of the row-pass output.

+    idct8_c(temp_in, temp_out);

+    for (j = 0; j < 8; ++j) {

+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],  // Combine the existing dest value with the rounded column result.

+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));

+    }

+  }

+}

 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

   int i, j;

   tran_high_t a1;

@@ -247,87 +383,120 @@

-void iadst4_c(const tran_low_t *input, tran_low_t *output) {

-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

-  tran_low_t x0 = input[0];

-  tran_low_t x1 = input[1];

-  tran_low_t x2 = input[2];

-  tran_low_t x3 = input[3];

-  if (!(x0 | x1 | x2 | x3)) {

-    memset(output, 0, 4 * sizeof(*output));

-    return;

-  }

-  s0 = sinpi_1_9 * x0;

-  s1 = sinpi_2_9 * x0;

-  s2 = sinpi_3_9 * x1;

-  s3 = sinpi_4_9 * x2;

-  s4 = sinpi_1_9 * x2;

-  s5 = sinpi_2_9 * x3;

-  s6 = sinpi_4_9 * x3;

-  s7 = WRAPLOW(x0 - x2 + x3);

-  s0 = s0 + s3 + s5;

-  s1 = s1 - s4 - s6;

-  s3 = s2;

-  s2 = sinpi_3_9 * s7;

-  // 1-D transform scaling factor is sqrt(2).

-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

-  // + 1b (addition) = 29b.

-  // Hence the output bit depth is 15b.

-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));

-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));

-  output[2] = WRAPLOW(dct_const_round_shift(s2));

-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));

-}

-void iadst8_c(const tran_low_t *input, tran_low_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7;

-  tran_high_t x0 = input[7];

+void iadst16_c(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

+  tran_high_t s9, s10, s11, s12, s13, s14, s15;

+  tran_high_t x0 = input[15];

   tran_high_t x1 = input[0];

-  tran_high_t x2 = input[5];

+  tran_high_t x2 = input[13];

   tran_high_t x3 = input[2];

-  tran_high_t x4 = input[3];

+  tran_high_t x4 = input[11];

   tran_high_t x5 = input[4];

-  tran_high_t x6 = input[1];

+  tran_high_t x6 = input[9];

   tran_high_t x7 = input[6];

+  tran_high_t x8 = input[7];

+  tran_high_t x9 = input[8];

+  tran_high_t x10 = input[5];

+  tran_high_t x11 = input[10];

+  tran_high_t x12 = input[3];

+  tran_high_t x13 = input[12];

+  tran_high_t x14 = input[1];

+  tran_high_t x15 = input[14];

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

-    memset(output, 0, 8 * sizeof(*output));

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |

+        x13 | x14 | x15)) {

+    memset(output, 0, 16 * sizeof(*output));

     return;

   // stage 1

-  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);

-  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);

-  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);

-  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);

-  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);

-  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);

-  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);

-  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

+  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;

+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

+  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;

+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

+  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;

+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

+  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;

+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

+  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));

-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));

-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));

-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));

-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));

-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));

-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));

-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));

+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));

+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));

+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));

+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));

+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));

+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));

+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));

+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));

+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));

+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));

+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));

+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));

+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));

+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

   // stage 2

-  s0 = (int)x0;

-  s1 = (int)x1;

-  s2 = (int)x2;

-  s3 = (int)x3;

-  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);

-  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);

-  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);

-  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4;

+  s5 = x5;

+  s6 = x6;

+  s7 = x7;

+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;

+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;

+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;

+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;

+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;

+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;

+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;

+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

+  x0 = WRAPLOW(s0 + s4);

+  x1 = WRAPLOW(s1 + s5);

+  x2 = WRAPLOW(s2 + s6);

+  x3 = WRAPLOW(s3 + s7);

+  x4 = WRAPLOW(s0 - s4);

+  x5 = WRAPLOW(s1 - s5);

+  x6 = WRAPLOW(s2 - s6);

+  x7 = WRAPLOW(s3 - s7);

+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));

+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));

+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));

+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));

+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));

+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));

+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

+  // stage 3

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;

+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;

+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;

+  s8 = x8;

+  s9 = x9;

+  s10 = x10;

+  s11 = x11;

+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;

+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;

+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

   x0 = WRAPLOW(s0 + s2);

   x1 = WRAPLOW(s1 + s3);

   x2 = WRAPLOW(s0 - s2);

@@ -336,53 +505,52 @@

   x5 = WRAPLOW(dct_const_round_shift(s5 + s7));

   x6 = WRAPLOW(dct_const_round_shift(s4 - s6));

   x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

+  x8 = WRAPLOW(s8 + s10);

+  x9 = WRAPLOW(s9 + s11);

+  x10 = WRAPLOW(s8 - s10);

+  x11 = WRAPLOW(s9 - s11);

+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));

+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));

+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

-  // stage 3

-  s2 = (int)(cospi_16_64 * (x2 + x3));

-  s3 = (int)(cospi_16_64 * (x2 - x3));

-  s6 = (int)(cospi_16_64 * (x6 + x7));

-  s7 = (int)(cospi_16_64 * (x6 - x7));

+  // stage 4

+  s2 = (-cospi_16_64) * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (-x6 + x7);

+  s10 = cospi_16_64 * (x10 + x11);

+  s11 = cospi_16_64 * (-x10 + x11);

+  s14 = (-cospi_16_64) * (x14 + x15);

+  s15 = cospi_16_64 * (x14 - x15);

   x2 = WRAPLOW(dct_const_round_shift(s2));

   x3 = WRAPLOW(dct_const_round_shift(s3));

   x6 = WRAPLOW(dct_const_round_shift(s6));

   x7 = WRAPLOW(dct_const_round_shift(s7));

+  x10 = WRAPLOW(dct_const_round_shift(s10));

+  x11 = WRAPLOW(dct_const_round_shift(s11));

+  x14 = WRAPLOW(dct_const_round_shift(s14));

+  x15 = WRAPLOW(dct_const_round_shift(s15));

   output[0] = WRAPLOW(x0);

-  output[1] = WRAPLOW(-x4);

-  output[2] = WRAPLOW(x6);

-  output[3] = WRAPLOW(-x2);

-  output[4] = WRAPLOW(x3);

-  output[5] = WRAPLOW(-x7);

-  output[6] = WRAPLOW(x5);

-  output[7] = WRAPLOW(-x1);

+  output[1] = WRAPLOW(-x8);

+  output[2] = WRAPLOW(x12);

+  output[3] = WRAPLOW(-x4);

+  output[4] = WRAPLOW(x6);

+  output[5] = WRAPLOW(x14);

+  output[6] = WRAPLOW(x10);

+  output[7] = WRAPLOW(x2);

+  output[8] = WRAPLOW(x3);

+  output[9] = WRAPLOW(x11);

+  output[10] = WRAPLOW(x15);

+  output[11] = WRAPLOW(x7);

+  output[12] = WRAPLOW(x5);

+  output[13] = WRAPLOW(-x13);

+  output[14] = WRAPLOW(x9);

+  output[15] = WRAPLOW(-x1);

-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

-  int i, j;

-  tran_low_t out[8 * 8] = { 0 };

-  tran_low_t *outptr = out;

-  tran_low_t temp_in[8], temp_out[8];

-  // First transform rows

-  // Only first 4 row has non-zero coefs

-  for (i = 0; i < 4; ++i) {

-    idct8_c(input, outptr);

-    input += 8;

-    outptr += 8;

-  }

-  // Then transform columns

-  for (i = 0; i < 8; ++i) {

-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-    idct8_c(temp_in, temp_out);

-    for (j = 0; j < 8; ++j) {

-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],

-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));

-    }

-  }

-}

 void idct16_c(const tran_low_t *input, tran_low_t *output) {

   tran_low_t step1[16], step2[16];

   tran_high_t temp1, temp2;

@@ -573,175 +741,7 @@

-void iadst16_c(const tran_low_t *input, tran_low_t *output) {

-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

-  tran_high_t s9, s10, s11, s12, s13, s14, s15;

-  tran_high_t x0 = input[15];

-  tran_high_t x1 = input[0];

-  tran_high_t x2 = input[13];

-  tran_high_t x3 = input[2];

-  tran_high_t x4 = input[11];

-  tran_high_t x5 = input[4];

-  tran_high_t x6 = input[9];

-  tran_high_t x7 = input[6];

-  tran_high_t x8 = input[7];

-  tran_high_t x9 = input[8];

-  tran_high_t x10 = input[5];

-  tran_high_t x11 = input[10];

-  tran_high_t x12 = input[3];

-  tran_high_t x13 = input[12];

-  tran_high_t x14 = input[1];

-  tran_high_t x15 = input[14];

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |

-        x13 | x14 | x15)) {

-    memset(output, 0, 16 * sizeof(*output));

-    return;

-  }

-  // stage 1

-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;

-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;

-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;

-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;

-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));

-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));

-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));

-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));

-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));

-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));

-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));

-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));

-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));

-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));

-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));

-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));

-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));

-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));

-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));

-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

-  // stage 2

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4;

-  s5 = x5;

-  s6 = x6;

-  s7 = x7;

-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;

-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;

-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;

-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;

-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;

-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;

-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;

-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

-  x0 = WRAPLOW(s0 + s4);

-  x1 = WRAPLOW(s1 + s5);

-  x2 = WRAPLOW(s2 + s6);

-  x3 = WRAPLOW(s3 + s7);

-  x4 = WRAPLOW(s0 - s4);

-  x5 = WRAPLOW(s1 - s5);

-  x6 = WRAPLOW(s2 - s6);

-  x7 = WRAPLOW(s3 - s7);

-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));

-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));

-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));

-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));

-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));

-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));

-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));

-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

-  // stage 3

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;

-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;

-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;

-  s8 = x8;

-  s9 = x9;

-  s10 = x10;

-  s11 = x11;

-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;

-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;

-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

-  x0 = WRAPLOW(s0 + s2);

-  x1 = WRAPLOW(s1 + s3);

-  x2 = WRAPLOW(s0 - s2);

-  x3 = WRAPLOW(s1 - s3);

-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));

-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));

-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));

-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

-  x8 = WRAPLOW(s8 + s10);

-  x9 = WRAPLOW(s9 + s11);

-  x10 = WRAPLOW(s8 - s10);

-  x11 = WRAPLOW(s9 - s11);

-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));

-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));

-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));

-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

-  // stage 4

-  s2 = (-cospi_16_64) * (x2 + x3);

-  s3 = cospi_16_64 * (x2 - x3);

-  s6 = cospi_16_64 * (x6 + x7);

-  s7 = cospi_16_64 * (-x6 + x7);

-  s10 = cospi_16_64 * (x10 + x11);

-  s11 = cospi_16_64 * (-x10 + x11);

-  s14 = (-cospi_16_64) * (x14 + x15);

-  s15 = cospi_16_64 * (x14 - x15);

-  x2 = WRAPLOW(dct_const_round_shift(s2));

-  x3 = WRAPLOW(dct_const_round_shift(s3));

-  x6 = WRAPLOW(dct_const_round_shift(s6));

-  x7 = WRAPLOW(dct_const_round_shift(s7));

-  x10 = WRAPLOW(dct_const_round_shift(s10));

-  x11 = WRAPLOW(dct_const_round_shift(s11));

-  x14 = WRAPLOW(dct_const_round_shift(s14));

-  x15 = WRAPLOW(dct_const_round_shift(s15));

-  output[0] = WRAPLOW(x0);

-  output[1] = WRAPLOW(-x8);

-  output[2] = WRAPLOW(x12);

-  output[3] = WRAPLOW(-x4);

-  output[4] = WRAPLOW(x6);

-  output[5] = WRAPLOW(x14);

-  output[6] = WRAPLOW(x10);

-  output[7] = WRAPLOW(x2);

-  output[8] = WRAPLOW(x3);

-  output[9] = WRAPLOW(x11);

-  output[10] = WRAPLOW(x15);

-  output[11] = WRAPLOW(x7);

-  output[12] = WRAPLOW(x5);

-  output[13] = WRAPLOW(-x13);

-  output[14] = WRAPLOW(x9);

-  output[15] = WRAPLOW(-x1);

-}

-void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,

+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,

                             int stride) {

   int i, j;

   tran_low_t out[16 * 16] = { 0 };

@@ -749,8 +749,8 @@

   tran_low_t temp_in[16], temp_out[16];

   // First transform rows. Since all non-zero dct coefficients are in

-  // upper-left 4x4 area, we only need to calculate first 4 rows here.

-  for (i = 0; i < 4; ++i) {

+  // upper-left 8x8 area, we only need to calculate first 8 rows here.

+  for (i = 0; i < 8; ++i) {

     idct16_c(input, outptr);

     input += 16;

     outptr += 16;

@@ -767,7 +767,7 @@

-void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,

+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,

                             int stride) {

   int i, j;

   tran_low_t out[16 * 16] = { 0 };

@@ -775,8 +775,8 @@

   tran_low_t temp_in[16], temp_out[16];

   // First transform rows. Since all non-zero dct coefficients are in

-  // upper-left 8x8 area, we only need to calculate first 8 rows here.

-  for (i = 0; i < 8; ++i) {

+  // upper-left 4x4 area, we only need to calculate first 4 rows here.

+  for (i = 0; i < 4; ++i) {

     idct16_c(input, outptr);

     input += 16;

     outptr += 16;

@@ -1377,6 +1377,51 @@

+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 4-point 1-D transform, high-bitdepth variant: input[0..3] -> output[0..3]; bd is forwarded to HIGHBD_WRAPLOW.

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

+  tran_low_t x0 = input[0];

+  tran_low_t x1 = input[1];

+  tran_low_t x2 = input[2];

+  tran_low_t x3 = input[3];

+  (void)bd;  // NOTE(review): bd is still passed to HIGHBD_WRAPLOW below; presumably this silences an unused-parameter warning when that macro ignores bd. Confirm.

+  if (detect_invalid_highbd_input(input, 4)) {  // Rejected input: assert when CONFIG_COEFFICIENT_RANGE_CHECKING is set, then zero the output and return.

+#if CONFIG_COEFFICIENT_RANGE_CHECKING

+    assert(0 && "invalid highbd txfm input");

+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING

+    memset(output, 0, sizeof(*output) * 4);

+    return;

+  }

+  if (!(x0 | x1 | x2 | x3)) {  // All four inputs are zero: zero the output and skip the arithmetic.

+    memset(output, 0, 4 * sizeof(*output));

+    return;

+  }

+  s0 = sinpi_1_9 * x0;  // s0..s6: individual input-times-sinpi products, combined below.

+  s1 = sinpi_2_9 * x0;

+  s2 = sinpi_3_9 * x1;

+  s3 = sinpi_4_9 * x2;

+  s4 = sinpi_1_9 * x2;

+  s5 = sinpi_2_9 * x3;

+  s6 = sinpi_4_9 * x3;

+  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

+  s0 = s0 + s3 + s5;

+  s1 = s1 - s4 - s6;

+  s3 = s2;  // The old s3 was already folded into s0; s3 now carries the x1 product.

+  s2 = sinpi_3_9 * s7;

+  // 1-D transform scaling factor is sqrt(2).

+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

+  // + 1b (addition) = 29b.

+  // Hence the output bit depth is 15b.

+  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);  // Every output is rounded by dct_const_round_shift, then passed through HIGHBD_WRAPLOW.

+  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);

+  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);

+  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);

+}

 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step[4];

   tran_high_t temp1, temp2;

@@ -1453,6 +1498,90 @@

+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {  // 8-point 1-D transform, high-bitdepth variant: input[0..7] -> output[0..7]; bd is forwarded to HIGHBD_WRAPLOW.

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

+  tran_low_t x0 = input[7];  // Inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.

+  tran_low_t x1 = input[0];

+  tran_low_t x2 = input[5];

+  tran_low_t x3 = input[2];

+  tran_low_t x4 = input[3];

+  tran_low_t x5 = input[4];

+  tran_low_t x6 = input[1];

+  tran_low_t x7 = input[6];

+  (void)bd;  // NOTE(review): bd is still passed to HIGHBD_WRAPLOW below; presumably this silences an unused-parameter warning when that macro ignores bd. Confirm.

+  if (detect_invalid_highbd_input(input, 8)) {  // Rejected input: assert when CONFIG_COEFFICIENT_RANGE_CHECKING is set, then zero the output and return.

+#if CONFIG_COEFFICIENT_RANGE_CHECKING

+    assert(0 && "invalid highbd txfm input");

+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING

+    memset(output, 0, sizeof(*output) * 8);

+    return;

+  }

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // All eight inputs are zero: zero the output and skip the arithmetic.

+    memset(output, 0, 8 * sizeof(*output));

+    return;

+  }

+  // stage 1: combine each pair (x0,x1) ... (x6,x7) with two cospi constants, then add/subtract the halves with rounding.

+  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

+  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;

+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

+  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

+  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

+  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);

+  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);

+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);

+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);

+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);

+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);

+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);

+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

+  // stage 2: x0..x3 pass through unscaled; x4..x7 are combined with cospi_8_64 / cospi_24_64.

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;

+  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;

+  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;

+  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);  // No round shift on x0..x3: s0..s3 were not multiplied in this stage.

+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);

+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);

+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);

+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);

+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);

+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);

+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

+  // stage 3: scale the sum and difference of (x2,x3) and of (x6,x7) by cospi_16_64; x0, x1, x4, x5 are left as they are.

+  s2 = cospi_16_64 * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (x6 - x7);

+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);

+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);

+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);

+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

+  output[0] = HIGHBD_WRAPLOW(x0, bd);  // Outputs are a permutation of x0..x7; the odd-indexed outputs are negated.

+  output[1] = HIGHBD_WRAPLOW(-x4, bd);

+  output[2] = HIGHBD_WRAPLOW(x6, bd);

+  output[3] = HIGHBD_WRAPLOW(-x2, bd);

+  output[4] = HIGHBD_WRAPLOW(x3, bd);

+  output[5] = HIGHBD_WRAPLOW(-x7, bd);

+  output[6] = HIGHBD_WRAPLOW(x5, bd);

+  output[7] = HIGHBD_WRAPLOW(-x1, bd);

+}

 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step1[8], step2[8];

   tran_high_t temp1, temp2;

@@ -1533,6 +1662,33 @@

+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,  // 8x8 two-pass transform (vpx_highbd_idct8_c on rows, then columns); the result is added into the destination.

+                                 int stride, int bd) {

+  int i, j;

+  tran_low_t out[8 * 8] = { 0 };  // Row-pass results; rows that are never written stay zero.

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[8], temp_out[8];

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // The destination is accessed as uint16_t elements from here on.

+  // First transform rows

+  // Only the first 4 rows have non-zero coefs, so only those rows are transformed.

+  for (i = 0; i < 4; ++i) {

+    vpx_highbd_idct8_c(input, outptr, bd);

+    input += 8;

+    outptr += 8;

+  }

+  // Then transform columns

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // Gather column i of the row-pass output.

+    vpx_highbd_idct8_c(temp_in, temp_out, bd);

+    for (j = 0; j < 8; ++j) {

+      dest[j * stride + i] = highbd_clip_pixel_add(  // Combine the existing dest value with the rounded column result.

+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

+    }

+  }

+}

 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,

                                 int stride, int bd) {

   int i, j;

@@ -1549,94 +1705,75 @@

-void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {

-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

-  tran_low_t x0 = input[0];

-  tran_low_t x1 = input[1];

-  tran_low_t x2 = input[2];

-  tran_low_t x3 = input[3];

-  (void)bd;

-  if (detect_invalid_highbd_input(input, 4)) {

-#if CONFIG_COEFFICIENT_RANGE_CHECKING

-    assert(0 && "invalid highbd txfm input");

-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING

-    memset(output, 0, sizeof(*output) * 4);

-    return;

-  }

-  if (!(x0 | x1 | x2 | x3)) {

-    memset(output, 0, 4 * sizeof(*output));

-    return;

-  }

-  s0 = sinpi_1_9 * x0;

-  s1 = sinpi_2_9 * x0;

-  s2 = sinpi_3_9 * x1;

-  s3 = sinpi_4_9 * x2;

-  s4 = sinpi_1_9 * x2;

-  s5 = sinpi_2_9 * x3;

-  s6 = sinpi_4_9 * x3;

-  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

-  s0 = s0 + s3 + s5;

-  s1 = s1 - s4 - s6;

-  s3 = s2;

-  s2 = sinpi_3_9 * s7;

-  // 1-D transform scaling factor is sqrt(2).

-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

-  // + 1b (addition) = 29b.

-  // Hence the output bit depth is 15b.

-  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);

-  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);

-  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);

-  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);

-}

-void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {

-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

-  tran_low_t x0 = input[7];

+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

+  tran_high_t s9, s10, s11, s12, s13, s14, s15;

+  tran_low_t x0 = input[15];

   tran_low_t x1 = input[0];

-  tran_low_t x2 = input[5];

+  tran_low_t x2 = input[13];

   tran_low_t x3 = input[2];

-  tran_low_t x4 = input[3];

+  tran_low_t x4 = input[11];

   tran_low_t x5 = input[4];

-  tran_low_t x6 = input[1];

+  tran_low_t x6 = input[9];

   tran_low_t x7 = input[6];

+  tran_low_t x8 = input[7];

+  tran_low_t x9 = input[8];

+  tran_low_t x10 = input[5];

+  tran_low_t x11 = input[10];

+  tran_low_t x12 = input[3];

+  tran_low_t x13 = input[12];

+  tran_low_t x14 = input[1];

+  tran_low_t x15 = input[14];

   (void)bd;

-  if (detect_invalid_highbd_input(input, 8)) {

+  if (detect_invalid_highbd_input(input, 16)) {

 #if CONFIG_COEFFICIENT_RANGE_CHECKING

     assert(0 && "invalid highbd txfm input");

 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING

-    memset(output, 0, sizeof(*output) * 8);

+    memset(output, 0, sizeof(*output) * 16);

     return;

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

-    memset(output, 0, 8 * sizeof(*output));

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |

+        x13 | x14 | x15)) {

+    memset(output, 0, 16 * sizeof(*output));

     return;

   // stage 1

-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;

-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

+  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;

+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

+  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;

+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

+  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;

+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

+  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;

+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

+  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

-  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);

-  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);

-  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);

-  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);

-  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);

-  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);

-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);

-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

+  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);

+  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);

+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);

+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);

+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);

+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);

+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);

+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);

+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);

+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);

+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);

+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);

+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);

+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);

+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);

+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

   // stage 2

   s0 = x0;

@@ -1643,11 +1780,54 @@

   s1 = x1;

   s2 = x2;

   s3 = x3;

-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;

-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;

-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;

-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

+  s4 = x4;

+  s5 = x5;

+  s6 = x6;

+  s7 = x7;

+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;

+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;

+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;

+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;

+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;

+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;

+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;

+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);

+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);

+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);

+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);

+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);

+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);

+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);

+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);

+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);

+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);

+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);

+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);

+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);

+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);

+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);

+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

+  // stage 3

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;

+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;

+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;

+  s8 = x8;

+  s9 = x9;

+  s10 = x10;

+  s11 = x11;

+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;

+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;

+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);

   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);

   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);

@@ -1656,55 +1836,52 @@

   x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);

   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);

   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);

+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);

+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);

+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);

+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);

+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);

+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);

+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

-  // stage 3

-  s2 = cospi_16_64 * (x2 + x3);

+  // stage 4

+  s2 = (-cospi_16_64) * (x2 + x3);

   s3 = cospi_16_64 * (x2 - x3);

   s6 = cospi_16_64 * (x6 + x7);

-  s7 = cospi_16_64 * (x6 - x7);

+  s7 = cospi_16_64 * (-x6 + x7);

+  s10 = cospi_16_64 * (x10 + x11);

+  s11 = cospi_16_64 * (-x10 + x11);

+  s14 = (-cospi_16_64) * (x14 + x15);

+  s15 = cospi_16_64 * (x14 - x15);

   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);

   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);

   x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);

   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);

+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);

+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);

+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

   output[0] = HIGHBD_WRAPLOW(x0, bd);

-  output[1] = HIGHBD_WRAPLOW(-x4, bd);

-  output[2] = HIGHBD_WRAPLOW(x6, bd);

-  output[3] = HIGHBD_WRAPLOW(-x2, bd);

-  output[4] = HIGHBD_WRAPLOW(x3, bd);

-  output[5] = HIGHBD_WRAPLOW(-x7, bd);

-  output[6] = HIGHBD_WRAPLOW(x5, bd);

-  output[7] = HIGHBD_WRAPLOW(-x1, bd);

+  output[1] = HIGHBD_WRAPLOW(-x8, bd);

+  output[2] = HIGHBD_WRAPLOW(x12, bd);

+  output[3] = HIGHBD_WRAPLOW(-x4, bd);

+  output[4] = HIGHBD_WRAPLOW(x6, bd);

+  output[5] = HIGHBD_WRAPLOW(x14, bd);

+  output[6] = HIGHBD_WRAPLOW(x10, bd);

+  output[7] = HIGHBD_WRAPLOW(x2, bd);

+  output[8] = HIGHBD_WRAPLOW(x3, bd);

+  output[9] = HIGHBD_WRAPLOW(x11, bd);

+  output[10] = HIGHBD_WRAPLOW(x15, bd);

+  output[11] = HIGHBD_WRAPLOW(x7, bd);

+  output[12] = HIGHBD_WRAPLOW(x5, bd);

+  output[13] = HIGHBD_WRAPLOW(-x13, bd);

+  output[14] = HIGHBD_WRAPLOW(x9, bd);

+  output[15] = HIGHBD_WRAPLOW(-x1, bd);

-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,

-                                 int stride, int bd) {

-  int i, j;

-  tran_low_t out[8 * 8] = { 0 };

-  tran_low_t *outptr = out;

-  tran_low_t temp_in[8], temp_out[8];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  // First transform rows

-  // Only first 4 row has non-zero coefs

-  for (i = 0; i < 4; ++i) {

-    vpx_highbd_idct8_c(input, outptr, bd);

-    input += 8;

-    outptr += 8;

-  }

-  // Then transform columns

-  for (i = 0; i < 8; ++i) {

-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-    vpx_highbd_idct8_c(temp_in, temp_out, bd);

-    for (j = 0; j < 8; ++j) {

-      dest[j * stride + i] = highbd_clip_pixel_add(

-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

-    }

-  }

-}

 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step1[16], step2[16];

   tran_high_t temp1, temp2;

@@ -1903,183 +2080,6 @@

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

-}

-void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {

-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

-  tran_high_t s9, s10, s11, s12, s13, s14, s15;

-  tran_low_t x0 = input[15];

-  tran_low_t x1 = input[0];

-  tran_low_t x2 = input[13];

-  tran_low_t x3 = input[2];

-  tran_low_t x4 = input[11];

-  tran_low_t x5 = input[4];

-  tran_low_t x6 = input[9];

-  tran_low_t x7 = input[6];

-  tran_low_t x8 = input[7];

-  tran_low_t x9 = input[8];

-  tran_low_t x10 = input[5];

-  tran_low_t x11 = input[10];

-  tran_low_t x12 = input[3];

-  tran_low_t x13 = input[12];

-  tran_low_t x14 = input[1];

-  tran_low_t x15 = input[14];

-  (void)bd;

-  if (detect_invalid_highbd_input(input, 16)) {

-#if CONFIG_COEFFICIENT_RANGE_CHECKING

-    assert(0 && "invalid highbd txfm input");

-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING

-    memset(output, 0, sizeof(*output) * 16);

-    return;

-  }

-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |

-        x13 | x14 | x15)) {

-    memset(output, 0, 16 * sizeof(*output));

-    return;

-  }

-  // stage 1

-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;

-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;

-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;

-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;

-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

-  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);

-  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);

-  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);

-  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);

-  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);

-  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);

-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);

-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);

-  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);

-  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);

-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);

-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);

-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);

-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);

-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);

-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

-  // stage 2

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4;

-  s5 = x5;

-  s6 = x6;

-  s7 = x7;

-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;

-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;

-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;

-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;

-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;

-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;

-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;

-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

-  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);

-  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);

-  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);

-  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);

-  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);

-  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);

-  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);

-  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);

-  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);

-  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);

-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);

-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);

-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);

-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);

-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);

-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

-  // stage 3

-  s0 = x0;

-  s1 = x1;

-  s2 = x2;

-  s3 = x3;

-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;

-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;

-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;

-  s8 = x8;

-  s9 = x9;

-  s10 = x10;

-  s11 = x11;

-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;

-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;

-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

-  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);

-  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);

-  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);

-  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);

-  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);

-  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);

-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);

-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

-  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);

-  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);

-  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);

-  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);

-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);

-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);

-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);

-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

-  // stage 4

-  s2 = (-cospi_16_64) * (x2 + x3);

-  s3 = cospi_16_64 * (x2 - x3);

-  s6 = cospi_16_64 * (x6 + x7);

-  s7 = cospi_16_64 * (-x6 + x7);

-  s10 = cospi_16_64 * (x10 + x11);

-  s11 = cospi_16_64 * (-x10 + x11);

-  s14 = (-cospi_16_64) * (x14 + x15);

-  s15 = cospi_16_64 * (x14 - x15);

-  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);

-  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);

-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);

-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);

-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);

-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);

-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

-  output[0] = HIGHBD_WRAPLOW(x0, bd);

-  output[1] = HIGHBD_WRAPLOW(-x8, bd);

-  output[2] = HIGHBD_WRAPLOW(x12, bd);

-  output[3] = HIGHBD_WRAPLOW(-x4, bd);

-  output[4] = HIGHBD_WRAPLOW(x6, bd);

-  output[5] = HIGHBD_WRAPLOW(x14, bd);

-  output[6] = HIGHBD_WRAPLOW(x10, bd);

-  output[7] = HIGHBD_WRAPLOW(x2, bd);

-  output[8] = HIGHBD_WRAPLOW(x3, bd);

-  output[9] = HIGHBD_WRAPLOW(x11, bd);

-  output[10] = HIGHBD_WRAPLOW(x15, bd);

-  output[11] = HIGHBD_WRAPLOW(x7, bd);

-  output[12] = HIGHBD_WRAPLOW(x5, bd);

-  output[13] = HIGHBD_WRAPLOW(-x13, bd);

-  output[14] = HIGHBD_WRAPLOW(x9, bd);

-  output[15] = HIGHBD_WRAPLOW(-x1, bd);

 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,