shithub: libvpx

--- a/configure

+++ b/configure

@@ -281,6 +281,7 @@

     spatial_svc

     vp9_temporal_denoising

     fp_mb_stats

+    emulate_hardware_highbitdepth

 CONFIG_LIST="

     external_build

--- a/test/convolve_test.cc

+++ b/test/convolve_test.cc

@@ -581,6 +581,8 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP9_HIGHBITDEPTH

+#else

 const ConvolveFunctions convolve8_c(

     vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,

     vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,

@@ -600,8 +602,11 @@

     make_tuple(64, 32, &convolve8_c),

     make_tuple(32, 64, &convolve8_c),

     make_tuple(64, 64, &convolve8_c)));

+#endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && ARCH_X86_64

+#if CONFIG_VP9_HIGHBITDEPTH

+#else

 const ConvolveFunctions convolve8_sse2(

     vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,

     vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,

@@ -621,6 +626,7 @@

     make_tuple(64, 32, &convolve8_sse2),

     make_tuple(32, 64, &convolve8_sse2),

     make_tuple(64, 64, &convolve8_sse2)));

+#endif

 #endif

 #if HAVE_SSSE3

--- a/test/dct16x16_test.cc

+++ b/test/dct16x16_test.cc

@@ -20,12 +20,9 @@

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_entropy.h"

+#include "vpx/vpx_codec.h"

 #include "vpx/vpx_integer.h"

-extern "C" {

-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);

-}

 using libvpx_test::ACMRandom;

 namespace {

@@ -258,42 +255,72 @@

-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);

-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);

-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,

+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);

+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);

+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,

                         int tx_type);

-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,

+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,

                         int tx_type);

-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;

-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;

+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;

+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;

-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride,

+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,

                    int /*tx_type*/) {

   vp9_fdct16x16_c(in, out, stride);

-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride,

+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,

                    int /*tx_type*/) {

   vp9_idct16x16_256_add_c(in, dest, stride);

-void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {

+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,

+                  int tx_type) {

   vp9_fht16x16_c(in, out, stride, tx_type);

-void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {

+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,

+                  int tx_type) {

   vp9_iht16x16_256_add_c(in, dest, stride, tx_type);

+#if CONFIG_VP9_HIGHBITDEPTH

+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct16x16_256_add_c(in, out, stride, 10);

+}

+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct16x16_256_add_c(in, out, stride, 12);

+}

+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,

+                      int /*tx_type*/) {

+  idct16x16_10(in, out, stride);  // tx_type is ignored by this wrapper.

+}

+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,

+                      int /*tx_type*/) {

+  idct16x16_12(in, out, stride);  // tx_type is ignored by this wrapper.

+}

+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 10);

+}

+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 12);

+}

+#endif

 class Trans16x16TestBase {

  public:

   virtual ~Trans16x16TestBase() {}

  protected:

-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;

+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;

-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;

+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;

   void RunAccuracyCheck() {

     ACMRandom rnd(ACMRandom::DeterministicSeed());

@@ -302,23 +329,48 @@

     const int count_test_block = 10000;

     for (int i = 0; i < count_test_block; ++i) {

       DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);

-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);

+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);

       DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

       DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        src[j] = rnd.Rand8();

-        dst[j] = rnd.Rand8();

-        test_input_block[j] = src[j] - dst[j];

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8();

+          dst[j] = rnd.Rand8();

+          test_input_block[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand16() & mask_;

+          dst16[j] = rnd.Rand16() & mask_;

+          test_input_block[j] = src16[j] - dst16[j];

+#endif

+        }

       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,

                                           test_temp_block, pitch_));

-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+#endif

+      }

       for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const uint32_t diff =

+            bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const uint32_t diff = dst[j] - src[j];

+#endif

         const uint32_t error = diff * diff;

         if (max_error < error)

           max_error = error;

@@ -326,10 +378,10 @@

-    EXPECT_GE(1u, max_error)

+    EXPECT_GE(1u  << 2 * (bit_depth_ - 8), max_error)

         << "Error: 16x16 FHT/IHT has an individual round trip error > 1";

-    EXPECT_GE(count_test_block , total_error)

+    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)

         << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";

@@ -337,13 +389,13 @@

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     const int count_test_block = 1000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j)

-        input_block[j] = rnd.Rand8() - rnd.Rand8();

+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);

       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));

@@ -359,21 +411,21 @@

     const int count_test_block = 1000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        input_block[j] = rnd.Rand8() - rnd.Rand8();

-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;

+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;

       if (i == 0) {

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = 255;

+          input_extreme_block[j] = mask_;

       } else if (i == 1) {

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = -255;

+          input_extreme_block[j] = -mask_;

       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);

@@ -383,7 +435,7 @@

       // The minimum quant value is 4.

       for (int j = 0; j < kNumCoeffs; ++j) {

         EXPECT_EQ(output_block[j], output_ref_block[j]);

-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))

+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))

             << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";

@@ -394,23 +446,30 @@

     const int count_test_block = 1000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);

+#endif

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        input_block[j] = rnd.Rand8() - rnd.Rand8();

-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;

+        if (bit_depth_ == VPX_BITS_8)

+          input_block[j] = rnd.Rand8() - rnd.Rand8();

+        else

+          input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;

       if (i == 0)

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = 255;

+          input_extreme_block[j] = mask_;

       if (i == 1)

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = -255;

+          input_extreme_block[j] = -mask_;

       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);

@@ -417,16 +476,35 @@

       // clear reconstructed pixel buffers

       vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));

       vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));

+#if CONFIG_VP9_HIGHBITDEPTH

+      vpx_memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));

+      vpx_memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));

+#endif

       // quantization with maximum allowed step sizes

       output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;

       for (int j = 1; j < kNumCoeffs; ++j)

         output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;

-      inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);

-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));

-      for (int j = 0; j < kNumCoeffs; ++j)

-        EXPECT_EQ(ref[j], dst[j]);

+      if (bit_depth_ == VPX_BITS_8) {

+        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,

+                     tx_type_);

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,

+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));

+#endif

+      }

+      if (bit_depth_ == VPX_BITS_8) {

+        for (int j = 0; j < kNumCoeffs; ++j)

+          EXPECT_EQ(ref[j], dst[j]);

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        for (int j = 0; j < kNumCoeffs; ++j)

+          EXPECT_EQ(ref16[j], dst16[j]);

+#endif

+      }

@@ -434,9 +512,13 @@

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     const int count_test_block = 1000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

     for (int i = 0; i < count_test_block; ++i) {

       double out_r[kNumCoeffs];

@@ -443,9 +525,17 @@

       // Initialize a test block with input range [-255, 255].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        src[j] = rnd.Rand8();

-        dst[j] = rnd.Rand8();

-        in[j] = src[j] - dst[j];

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8();

+          dst[j] = rnd.Rand8();

+          in[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand16() & mask_;

+          dst16[j] = rnd.Rand16() & mask_;

+          in[j] = src16[j] - dst16[j];

+#endif

+        }

       reference_16x16_dct_2d(in, out_r);

@@ -452,10 +542,22 @@

       for (int j = 0; j < kNumCoeffs; ++j)

         coeff[j] = round(out_r[j]);

-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),

+                                            16));

+#endif

+      }

       for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const uint32_t diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const uint32_t diff = dst[j] - src[j];

+#endif

         const uint32_t error = diff * diff;

         EXPECT_GE(1u, error)

             << "Error: 16x16 IDCT has error " << error

@@ -465,6 +567,8 @@

   int pitch_;

   int tx_type_;

+  vpx_bit_depth_t bit_depth_;

+  int mask_;

   FhtFunc fwd_txfm_ref;

   IhtFunc inv_txfm_ref;

};

@@ -479,17 +583,34 @@

     fwd_txfm_ = GET_PARAM(0);

     inv_txfm_ = GET_PARAM(1);

     tx_type_  = GET_PARAM(2);

+    bit_depth_ = GET_PARAM(3);

     pitch_    = 16;

     fwd_txfm_ref = fdct16x16_ref;

     inv_txfm_ref = idct16x16_ref;

+    mask_ = (1 << bit_depth_) - 1;

+#if CONFIG_VP9_HIGHBITDEPTH

+    switch (bit_depth_) {  // Select the reference inverse for bit_depth_.

+      case VPX_BITS_10:

+        inv_txfm_ref = idct16x16_10_ref;

+        break;

+      case VPX_BITS_12:

+        inv_txfm_ref = idct16x16_12_ref;

+        break;

+      default:

+        inv_txfm_ref = idct16x16_ref;

+        break;

+    }

+#else

+    inv_txfm_ref = idct16x16_ref;

+#endif

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride);

-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride);

@@ -529,17 +650,34 @@

     fwd_txfm_ = GET_PARAM(0);

     inv_txfm_ = GET_PARAM(1);

     tx_type_  = GET_PARAM(2);

+    bit_depth_ = GET_PARAM(3);

     pitch_    = 16;

     fwd_txfm_ref = fht16x16_ref;

     inv_txfm_ref = iht16x16_ref;

+    mask_ = (1 << bit_depth_) - 1;

+#if CONFIG_VP9_HIGHBITDEPTH

+    switch (bit_depth_) {

+      case VPX_BITS_10:

+        inv_txfm_ref = iht16x16_10;

+        break;

+      case VPX_BITS_12:

+        inv_txfm_ref = iht16x16_12;

+        break;

+      default:

+        inv_txfm_ref = iht16x16_ref;

+        break;

+    }

+#else

+    inv_txfm_ref = iht16x16_ref;

+#endif

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride, tx_type_);

-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride, tx_type_);

@@ -567,45 +705,78 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     C, Trans16x16DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));

+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));

+#else

 INSTANTIATE_TEST_CASE_P(

+    C, Trans16x16DCT,

+    ::testing::Values(

+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));

+#endif

+#if CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(

     C, Trans16x16HT,

     ::testing::Values(

-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),

-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),

-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),

-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),

+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));

+#else

+INSTANTIATE_TEST_CASE_P(

+    C, Trans16x16HT,

+    ::testing::Values(

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));

+#endif

-#if HAVE_NEON_ASM

+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     NEON, Trans16x16DCT,

     ::testing::Values(

         make_tuple(&vp9_fdct16x16_c,

-                   &vp9_idct16x16_256_add_neon, 0)));

+                   &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8)));

 #endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans16x16DCT,

     ::testing::Values(

         make_tuple(&vp9_fdct16x16_sse2,

-                   &vp9_idct16x16_256_add_sse2, 0)));

+                   &vp9_idct16x16_256_add_sse2, 0, VPX_BITS_8)));

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans16x16HT,

     ::testing::Values(

-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),

-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),

-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),

-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));

+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,

+                   VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,

+                   VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,

+                   VPX_BITS_8),

+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,

+                   VPX_BITS_8)));

 #endif

-#if HAVE_SSSE3

+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSSE3, Trans16x16DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));

+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,

+                   VPX_BITS_8)));

 #endif

 }  // namespace

--- a/test/dct32x32_test.cc

+++ b/test/dct32x32_test.cc

@@ -21,6 +21,7 @@

 #include "./vpx_config.h"

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_entropy.h"

+#include "vpx/vpx_codec.h"

 #include "vpx/vpx_integer.h"

 using libvpx_test::ACMRandom;

@@ -71,11 +72,22 @@

-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);

-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);

+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);

+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);

-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int> Trans32x32Param;

+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>

+    Trans32x32Param;

+#if CONFIG_VP9_HIGHBITDEPTH

+void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct32x32_1024_add_c(in, out, stride, 10);

+}

+void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct32x32_1024_add_c(in, out, stride, 12);

+}

+#endif

 class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {

  public:

   virtual ~Trans32x32Test() {}

@@ -84,6 +96,8 @@

     inv_txfm_ = GET_PARAM(1);

     version_  = GET_PARAM(2);  // 0: high precision forward transform

                                // 1: low precision version for rd loop

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

@@ -90,6 +104,8 @@

  protected:

   int version_;

+  vpx_bit_depth_t bit_depth_;

+  int mask_;

   FwdTxfmFunc fwd_txfm_;

   InvTxfmFunc inv_txfm_;

};

@@ -100,23 +116,47 @@

   int64_t total_error = 0;

   const int count_test_block = 1000;

   DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

   for (int i = 0; i < count_test_block; ++i) {

-    // Initialize a test block with input range [-255, 255].

+    // Initialize a test block with input range [-mask_, mask_].

     for (int j = 0; j < kNumCoeffs; ++j) {

-      src[j] = rnd.Rand8();

-      dst[j] = rnd.Rand8();

-      test_input_block[j] = src[j] - dst[j];

+      if (bit_depth_ == VPX_BITS_8) {

+        src[j] = rnd.Rand8();

+        dst[j] = rnd.Rand8();

+        test_input_block[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {  // Other bit depths: 16-bit samples limited by mask_.

+        src16[j] = rnd.Rand16() & mask_;

+        dst16[j] = rnd.Rand16() & mask_;

+        test_input_block[j] = src16[j] - dst16[j];

+#endif

+      }

     ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));

-    ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));

+    if (bit_depth_ == VPX_BITS_8) {

+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));

+#if CONFIG_VP9_HIGHBITDEPTH

+    } else {

+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,

+                                         CONVERT_TO_BYTEPTR(dst16), 32));

+#endif

+    }

     for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+      const uint32_t diff =

+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

       const uint32_t diff = dst[j] - src[j];

+#endif

       const uint32_t error = diff * diff;

       if (max_error < error)

         max_error = error;

@@ -129,10 +169,10 @@

     total_error /= 45;

-  EXPECT_GE(1u, max_error)

+  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)

       << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";

-  EXPECT_GE(count_test_block, total_error)

+  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)

       << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";

@@ -141,12 +181,12 @@

   const int count_test_block = 1000;

   DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

   for (int i = 0; i < count_test_block; ++i) {

     for (int j = 0; j < kNumCoeffs; ++j)

-      input_block[j] = rnd.Rand8() - rnd.Rand8();

+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

     const int stride = 32;

     vp9_fdct32x32_c(input_block, output_ref_block, stride);

@@ -170,21 +210,21 @@

   DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

   for (int i = 0; i < count_test_block; ++i) {

-    // Initialize a test block with input range [-255, 255].

+    // Initialize a test block with input range [-mask_, mask_].

     for (int j = 0; j < kNumCoeffs; ++j) {

-      input_block[j] = rnd.Rand8() - rnd.Rand8();

-      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;

+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

+      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;

     if (i == 0) {

       for (int j = 0; j < kNumCoeffs; ++j)

-        input_extreme_block[j] = 255;

+        input_extreme_block[j] = mask_;

     } else if (i == 1) {

       for (int j = 0; j < kNumCoeffs; ++j)

-        input_extreme_block[j] = -255;

+        input_extreme_block[j] = -mask_;

     const int stride = 32;

@@ -201,9 +241,9 @@

         EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))

             << "Error: 32x32 FDCT rd has mismatched coefficients";

-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))

+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))

           << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";

-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))

+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))

           << "Error: 32x32 FDCT has coefficient larger than "

           << "4*DCT_MAX_VALUE";

@@ -214,9 +254,13 @@

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   const int count_test_block = 1000;

   DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

   for (int i = 0; i < count_test_block; ++i) {

     double out_r[kNumCoeffs];

@@ -223,17 +267,36 @@

     // Initialize a test block with input range [-255, 255]

     for (int j = 0; j < kNumCoeffs; ++j) {

-      src[j] = rnd.Rand8();

-      dst[j] = rnd.Rand8();

-      in[j] = src[j] - dst[j];

+      if (bit_depth_ == VPX_BITS_8) {

+        src[j] = rnd.Rand8();

+        dst[j] = rnd.Rand8();

+        in[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        src16[j] = rnd.Rand16() & mask_;

+        dst16[j] = rnd.Rand16() & mask_;

+        in[j] = src16[j] - dst16[j];

+#endif

+      }

     reference_32x32_dct_2d(in, out_r);

     for (int j = 0; j < kNumCoeffs; ++j)

       coeff[j] = round(out_r[j]);

-    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));

+    if (bit_depth_ == VPX_BITS_8) {

+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));

+#if CONFIG_VP9_HIGHBITDEPTH

+    } else {

+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));

+#endif

+    }

     for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+      const int diff =

+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

       const int diff = dst[j] - src[j];

+#endif

       const int error = diff * diff;

       EXPECT_GE(1, error)

           << "Error: 32x32 IDCT has error " << error

@@ -244,39 +307,59 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     C, Trans32x32Test,

     ::testing::Values(

-        make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),

-        make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));

+        make_tuple(&vp9_high_fdct32x32_c,

+                   &idct32x32_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fdct32x32_rd_c,

+                   &idct32x32_10, 1, VPX_BITS_10),

+        make_tuple(&vp9_high_fdct32x32_c,

+                   &idct32x32_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_high_fdct32x32_rd_c,

+                   &idct32x32_12, 1, VPX_BITS_12),

+        make_tuple(&vp9_fdct32x32_c,

+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fdct32x32_rd_c,

+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));

+#else

+INSTANTIATE_TEST_CASE_P(

+    C, Trans32x32Test,

+    ::testing::Values(

+        make_tuple(&vp9_fdct32x32_c,

+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fdct32x32_rd_c,

+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));

+#endif

-#if HAVE_NEON_ASM

+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     NEON, Trans32x32Test,

     ::testing::Values(

         make_tuple(&vp9_fdct32x32_c,

-                   &vp9_idct32x32_1024_add_neon, 0),

+                   &vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),

         make_tuple(&vp9_fdct32x32_rd_c,

-                   &vp9_idct32x32_1024_add_neon, 1)));

+                   &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));

 #endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans32x32Test,

     ::testing::Values(

         make_tuple(&vp9_fdct32x32_sse2,

-                   &vp9_idct32x32_1024_add_sse2, 0),

+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),

         make_tuple(&vp9_fdct32x32_rd_sse2,

-                   &vp9_idct32x32_1024_add_sse2, 1)));

+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));

 #endif

-#if HAVE_AVX2

+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     AVX2, Trans32x32Test,

     ::testing::Values(

         make_tuple(&vp9_fdct32x32_avx2,

-                   &vp9_idct32x32_1024_add_sse2, 0),

+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),

         make_tuple(&vp9_fdct32x32_rd_avx2,

-                   &vp9_idct32x32_1024_add_sse2, 1)));

+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));

 #endif

 }  // namespace

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -20,46 +20,71 @@

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_entropy.h"

+#include "vpx/vpx_codec.h"

 #include "vpx/vpx_integer.h"

-extern "C" {

-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);

-}

 using libvpx_test::ACMRandom;

 namespace {

 const int kNumCoeffs = 16;

-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);

-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);

-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,

+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);

+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);

+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,

                         int tx_type);

-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,

+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,

                         int tx_type);

-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;

-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;

+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;

+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;

-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {

+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,

+                 int tx_type) {

   vp9_fdct4x4_c(in, out, stride);

-void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {

+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {

   vp9_fht4x4_c(in, out, stride, tx_type);

-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {

+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,

+                 int tx_type) {

   vp9_fwht4x4_c(in, out, stride);

+#if CONFIG_VP9_HIGHBITDEPTH

+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct4x4_16_add_c(in, out, stride, 10);

+}

+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct4x4_16_add_c(in, out, stride, 12);

+}

+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 10);

+}

+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 12);

+}

+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_iwht4x4_16_add_c(in, out, stride, 10);

+}

+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_iwht4x4_16_add_c(in, out, stride, 12);

+}

+#endif

 class Trans4x4TestBase {

  public:

   virtual ~Trans4x4TestBase() {}

  protected:

-  virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;

+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;

-  virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;

+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;

   void RunAccuracyCheck(int limit) {

     ACMRandom rnd(ACMRandom::DeterministicSeed());

@@ -68,23 +93,47 @@

     const int count_test_block = 10000;

     for (int i = 0; i < count_test_block; ++i) {

       DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);

-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);

+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);

       DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

       DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

       // Initialize a test block with input range [-255, 255].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        src[j] = rnd.Rand8();

-        dst[j] = rnd.Rand8();

-        test_input_block[j] = src[j] - dst[j];

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8();

+          dst[j] = rnd.Rand8();

+          test_input_block[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand16() & mask_;

+          dst16[j] = rnd.Rand16() & mask_;

+          test_input_block[j] = src16[j] - dst16[j];

+#endif

+        }

       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,

                                           test_temp_block, pitch_));

-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,

+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));

+#endif

+      }

       for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const uint32_t diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const uint32_t diff = dst[j] - src[j];

+#endif

         const uint32_t error = diff * diff;

         if (max_error < error)

           max_error = error;

@@ -105,13 +154,13 @@

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     const int count_test_block = 5000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j)

-        input_block[j] = rnd.Rand8() - rnd.Rand8();

+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);

       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));

@@ -127,21 +176,21 @@

     const int count_test_block = 5000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        input_block[j] = rnd.Rand8() - rnd.Rand8();

-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;

+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);

+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;

       if (i == 0) {

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = 255;

+          input_extreme_block[j] = mask_;

       } else if (i == 1) {

         for (int j = 0; j < kNumCoeffs; ++j)

-          input_extreme_block[j] = -255;

+          input_extreme_block[j] = -mask_;

       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);

@@ -151,8 +200,8 @@

       // The minimum quant value is 4.

       for (int j = 0; j < kNumCoeffs; ++j) {

         EXPECT_EQ(output_block[j], output_ref_block[j]);

-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))

-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";

+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))

+            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";

@@ -161,24 +210,48 @@

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     const int count_test_block = 1000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+#endif

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < kNumCoeffs; ++j) {

-        src[j] = rnd.Rand8();

-        dst[j] = rnd.Rand8();

-        in[j] = src[j] - dst[j];

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8();

+          dst[j] = rnd.Rand8();

+          in[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand16() & mask_;

+          dst16[j] = rnd.Rand16() & mask_;

+          in[j] = src16[j] - dst16[j];

+#endif

+        }

       fwd_txfm_ref(in, coeff, pitch_, tx_type_);

-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),

+                                            pitch_));

+#endif

+      }

       for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const uint32_t diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const uint32_t diff = dst[j] - src[j];

+#endif

         const uint32_t error = diff * diff;

         EXPECT_GE(static_cast<uint32_t>(limit), error)

             << "Error: 4x4 IDCT has error " << error

@@ -190,6 +263,8 @@

   int pitch_;

   int tx_type_;

   FhtFunc fwd_txfm_ref;

+  vpx_bit_depth_t bit_depth_;

+  int mask_;

};

 class Trans4x4DCT

@@ -204,14 +279,16 @@

     tx_type_  = GET_PARAM(2);

     pitch_    = 4;

     fwd_txfm_ref = fdct4x4_ref;

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride);

-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride);

@@ -247,15 +324,17 @@

     tx_type_  = GET_PARAM(2);

     pitch_    = 4;

     fwd_txfm_ref = fht4x4_ref;

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride, tx_type_);

-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride, tx_type_);

@@ -291,14 +370,16 @@

     tx_type_  = GET_PARAM(2);

     pitch_    = 4;

     fwd_txfm_ref = fwht4x4_ref;

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride);

-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride);

@@ -323,57 +404,95 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     C, Trans4x4DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));

+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));

+#else

 INSTANTIATE_TEST_CASE_P(

+    C, Trans4x4DCT,

+    ::testing::Values(

+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));

+#endif

+#if CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(

     C, Trans4x4HT,

     ::testing::Values(

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),

+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));

+#else

 INSTANTIATE_TEST_CASE_P(

+    C, Trans4x4HT,

+    ::testing::Values(

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));

+#endif

+#if CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(

     C, Trans4x4WHT,

     ::testing::Values(

-        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));

+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));

+#else

+INSTANTIATE_TEST_CASE_P(

+    C, Trans4x4WHT,

+    ::testing::Values(

+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));

+#endif

-#if HAVE_NEON_ASM

+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     NEON, Trans4x4DCT,

     ::testing::Values(

         make_tuple(&vp9_fdct4x4_c,

-                   &vp9_idct4x4_16_add_neon, 0)));

+                   &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));

 INSTANTIATE_TEST_CASE_P(

     DISABLED_NEON, Trans4x4HT,

     ::testing::Values(

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),

-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));

 #endif

-#if CONFIG_USE_X86INC && HAVE_MMX

+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     MMX, Trans4x4WHT,

     ::testing::Values(

-        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));

+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));

 #endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans4x4DCT,

     ::testing::Values(

         make_tuple(&vp9_fdct4x4_sse2,

-                   &vp9_idct4x4_16_add_sse2, 0)));

+                   &vp9_idct4x4_16_add_sse2, 0, VPX_BITS_8)));

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans4x4HT,

     ::testing::Values(

-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),

-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),

-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),

-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));

+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));

 #endif

 }  // namespace

--- a/test/fdct8x8_test.cc

+++ b/test/fdct8x8_test.cc

@@ -20,45 +20,96 @@

 #include "./vp9_rtcd.h"

 #include "vp9/common/vp9_entropy.h"

+#include "vpx/vpx_codec.h"

 #include "vpx/vpx_integer.h"

-extern "C" {

-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);

+const int kNumCoeffs = 64;

+const double kPi = 3.141592653589793238462643383279502884;

+void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {

+  const double kInvSqrt2 = 0.707106781186547524400844362104;

+  for (int k = 0; k < 8; k++) {

+    out[k] = 0.0;

+    for (int n = 0; n < 8; n++)

+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);

+    if (k == 0)

+      out[k] = out[k] * kInvSqrt2;

+  }

+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],

+                          double output[kNumCoeffs]) {

+  // First transform columns

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = input[j*8 + i];

+    reference_8x8_dct_1d(temp_in, temp_out, 1);

+    for (int j = 0; j < 8; ++j)

+      output[j * 8 + i] = temp_out[j];

+  }

+  // Then transform rows

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = output[j + i*8];

+    reference_8x8_dct_1d(temp_in, temp_out, 1);

+    // Scale by some magic number

+    for (int j = 0; j < 8; ++j)

+      output[j + i * 8] = temp_out[j] * 2;

+  }

+}

 using libvpx_test::ACMRandom;

 namespace {

-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);

-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);

-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,

+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);

+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);

+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,

                         int tx_type);

-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,

+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,

                         int tx_type);

-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;

-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;

+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;

+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;

-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {

+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {

   vp9_fdct8x8_c(in, out, stride);

-void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {

+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {

   vp9_fht8x8_c(in, out, stride, tx_type);

+#if CONFIG_VP9_HIGHBITDEPTH

+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct8x8_64_add_c(in, out, stride, 10);

+}

+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {

+  vp9_high_idct8x8_64_add_c(in, out, stride, 12);

+}

+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 10);

+}

+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 12);

+}

+#endif

 class FwdTrans8x8TestBase {

  public:

   virtual ~FwdTrans8x8TestBase() {}

  protected:

-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;

-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;

+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;

+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;

   void RunSignBiasCheck() {

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_output_block, 64);

     int count_sign_block[64][2];

     const int count_test_block = 100000;

@@ -67,7 +118,8 @@

     for (int i = 0; i < count_test_block; ++i) {

       // Initialize a test block with input range [-255, 255].

       for (int j = 0; j < 64; ++j)

-        test_input_block[j] = rnd.Rand8() - rnd.Rand8();

+        test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -

+                              ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);

       ASM_REGISTER_STATE_CHECK(

           RunFwdTxfm(test_input_block, test_output_block, pitch_));

@@ -82,7 +134,7 @@

     for (int j = 0; j < 64; ++j) {

       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);

       const int max_diff = 1125;

-      EXPECT_LT(diff, max_diff)

+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))

           << "Error: 8x8 FDCT/FHT has a sign bias > "

           << 1. * max_diff / count_test_block * 100 << "%"

           << " for input range [-255, 255] at index " << j

@@ -111,7 +163,7 @@

     for (int j = 0; j < 64; ++j) {

       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);

       const int max_diff = 10000;

-      EXPECT_LT(diff, max_diff)

+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))

           << "Error: 4x4 FDCT/FHT has a sign bias > "

           << 1. * max_diff / count_test_block * 100 << "%"

           << " for input range [-15, 15] at index " << j

@@ -127,16 +179,28 @@

     int total_error = 0;

     const int count_test_block = 100000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);

+#endif

     for (int i = 0; i < count_test_block; ++i) {

       // Initialize a test block with input range [-255, 255].

       for (int j = 0; j < 64; ++j) {

-        src[j] = rnd.Rand8();

-        dst[j] = rnd.Rand8();

-        test_input_block[j] = src[j] - dst[j];

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8();

+          dst[j] = rnd.Rand8();

+          test_input_block[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand16() & mask_;

+          dst16[j] = rnd.Rand16() & mask_;

+          test_input_block[j] = src16[j] - dst16[j];

+#endif

+        }

       ASM_REGISTER_STATE_CHECK(

@@ -152,11 +216,23 @@

             test_temp_block[j] *= 4;

-      ASM_REGISTER_STATE_CHECK(

-          RunInvTxfm(test_temp_block, dst, pitch_));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+#endif

+      }

       for (int j = 0; j < 64; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const int diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const int diff = dst[j] - src[j];

+#endif

         const int error = diff * diff;

         if (max_error < error)

           max_error = error;

@@ -164,11 +240,11 @@

-    EXPECT_GE(1, max_error)

+    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)

       << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"

       << " roundtrip error > 1";

-    EXPECT_GE(count_test_block/5, total_error)

+    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)

       << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "

       << "error > 1/5 per block";

@@ -180,26 +256,45 @@

     int total_coeff_error = 0;

     const int count_test_block = 100000;

     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);

-    DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_temp_block, 64);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);

     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);

+#endif

     for (int i = 0; i < count_test_block; ++i) {

-      // Initialize a test block with input range [-255, 255].

+      // Initialize a test block with input range [-mask_, mask_].

       for (int j = 0; j < 64; ++j) {

-        if (i == 0) {

-          src[j] = 255;

-          dst[j] = 0;

-        } else if (i == 1) {

-          src[j] = 0;

-          dst[j] = 255;

+        if (bit_depth_ == VPX_BITS_8) {

+          if (i == 0) {

+            src[j] = 255;

+            dst[j] = 0;

+          } else if (i == 1) {

+            src[j] = 0;

+            dst[j] = 255;

+          } else {

+            src[j] = rnd.Rand8() % 2 ? 255 : 0;

+            dst[j] = rnd.Rand8() % 2 ? 255 : 0;

+          }

+          test_input_block[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

         } else {

-          src[j] = rnd.Rand8() % 2 ? 255 : 0;

-          dst[j] = rnd.Rand8() % 2 ? 255 : 0;

+          if (i == 0) {

+            src16[j] = mask_;

+            dst16[j] = 0;

+          } else if (i == 1) {

+            src16[j] = 0;

+            dst16[j] = mask_;

+          } else {

+            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;

+            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;

+          }

+          test_input_block[j] = src16[j] - dst16[j];

+#endif

-        test_input_block[j] = src[j] - dst[j];

       ASM_REGISTER_STATE_CHECK(

@@ -206,11 +301,23 @@

           RunFwdTxfm(test_input_block, test_temp_block, pitch_));

       ASM_REGISTER_STATE_CHECK(

           fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));

-      ASM_REGISTER_STATE_CHECK(

-          RunInvTxfm(test_temp_block, dst, pitch_));

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(

+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+#endif

+      }

       for (int j = 0; j < 64; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const int diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

         const int diff = dst[j] - src[j];

+#endif

         const int error = diff * diff;

         if (max_error < error)

           max_error = error;

@@ -220,11 +327,11 @@

         total_coeff_error += abs(coeff_diff);

-      EXPECT_GE(1, max_error)

+      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)

           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"

           << "an individual roundtrip error > 1";

-      EXPECT_GE(count_test_block/5, total_error)

+      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)

           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"

           << " roundtrip error > 1/5 per block";

@@ -234,9 +341,97 @@

+  void RunInvAccuracyCheck() {

+    ACMRandom rnd(ACMRandom::DeterministicSeed());

+    const int count_test_block = 1000;

+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);

+#if CONFIG_VP9_HIGHBITDEPTH

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);

+#endif

+    for (int i = 0; i < count_test_block; ++i) {

+      double out_r[kNumCoeffs];

+      // Initialize a test block with input range [-mask_, mask_].

+      for (int j = 0; j < kNumCoeffs; ++j) {

+        if (bit_depth_ == VPX_BITS_8) {

+          src[j] = rnd.Rand8() % 2 ? 255 : 0;

+          dst[j] = src[j] > 0 ? 0 : 255;

+          in[j] = src[j] - dst[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+        } else {

+          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;

+          dst16[j] = src16[j] > 0 ? 0 : mask_;

+          in[j] = src16[j] - dst16[j];

+#endif

+        }

+      }

+      reference_8x8_dct_2d(in, out_r);

+      for (int j = 0; j < kNumCoeffs; ++j)

+        coeff[j] = round(out_r[j]);

+      if (bit_depth_ == VPX_BITS_8) {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));

+#if CONFIG_VP9_HIGHBITDEPTH

+      } else {

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),

+                                            pitch_));

+#endif

+      }

+      for (int j = 0; j < kNumCoeffs; ++j) {

+#if CONFIG_VP9_HIGHBITDEPTH

+        const uint32_t diff =

+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];

+#else

+        const uint32_t diff = dst[j] - src[j];

+#endif

+        const uint32_t error = diff * diff;

+        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)

+            << "Error: 8x8 IDCT has error " << error

+            << " at index " << j;

+      }

+    }

+  }

+  void RunFwdAccuracyCheck() {

+    ACMRandom rnd(ACMRandom::DeterministicSeed());

+    const int count_test_block = 1000;

+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_r, kNumCoeffs);

+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);

+    for (int i = 0; i < count_test_block; ++i) {

+      double out_r[kNumCoeffs];

+      // Initialize a test block with input range [-mask_, mask_].

+      for (int j = 0; j < kNumCoeffs; ++j)

+        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;

+      RunFwdTxfm(in, coeff, pitch_);

+      reference_8x8_dct_2d(in, out_r);

+      for (int j = 0; j < kNumCoeffs; ++j)

+        coeff_r[j] = round(out_r[j]);

+      for (int j = 0; j < kNumCoeffs; ++j) {

+        const uint32_t diff = coeff[j] - coeff_r[j];

+        const uint32_t error = diff * diff;

+        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)

+            << "Error: 8x8 DCT has error " << error

+            << " at index " << j;

+      }

+    }

+  }

   int pitch_;

   int tx_type_;

   FhtFunc fwd_txfm_ref;

+  vpx_bit_depth_t bit_depth_;

+  int mask_;

};

 class FwdTrans8x8DCT

@@ -251,15 +446,17 @@

     tx_type_  = GET_PARAM(2);

     pitch_    = 8;

     fwd_txfm_ref = fdct8x8_ref;

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride);

-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride);

@@ -279,6 +476,14 @@

   RunExtremalCheck();

+// Runs the forward accuracy check for each FwdTrans8x8DCT parameter tuple.

+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {

+  RunFwdAccuracyCheck();

+}

+// Runs the inverse accuracy check for each FwdTrans8x8DCT parameter tuple.

+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {

+  RunInvAccuracyCheck();

+}

 class FwdTrans8x8HT

     : public FwdTrans8x8TestBase,

       public ::testing::TestWithParam<Ht8x8Param> {

@@ -291,15 +496,17 @@

     tx_type_  = GET_PARAM(2);

     pitch_    = 8;

     fwd_txfm_ref = fht8x8_ref;

+    bit_depth_ = GET_PARAM(3);

+    mask_ = (1 << bit_depth_) - 1;

   virtual void TearDown() { libvpx_test::ClearSystemState(); }

  protected:

-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {

+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {

     fwd_txfm_(in, out, stride, tx_type_);

-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {

+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {

     inv_txfm_(out, dst, stride, tx_type_);

@@ -321,50 +528,81 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     C, FwdTrans8x8DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));

+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));

+#else

 INSTANTIATE_TEST_CASE_P(

+    C, FwdTrans8x8DCT,

+    ::testing::Values(

+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));

+#endif

+#if CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(

     C, FwdTrans8x8HT,

     ::testing::Values(

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),

+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));

+#else

+INSTANTIATE_TEST_CASE_P(

+    C, FwdTrans8x8HT,

+    ::testing::Values(

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));

+#endif

-#if HAVE_NEON_ASM

+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     NEON, FwdTrans8x8DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));

+        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,

+                   VPX_BITS_8)));

 INSTANTIATE_TEST_CASE_P(

     DISABLED_NEON, FwdTrans8x8HT,

     ::testing::Values(

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),

-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));

 #endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, FwdTrans8x8DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));

+        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,

+                   VPX_BITS_8)));

 INSTANTIATE_TEST_CASE_P(

     SSE2, FwdTrans8x8HT,

     ::testing::Values(

-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),

-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),

-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),

-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));

+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),

+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));

 #endif

-#if HAVE_SSSE3 && ARCH_X86_64

+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSSE3, FwdTrans8x8DCT,

     ::testing::Values(

-        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));

+        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,

+                   VPX_BITS_8)));

 #endif

 }  // namespace

--- a/test/idct8x8_test.cc

+++ b/test/idct8x8_test.cc

@@ -109,7 +109,8 @@

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   const int count_test_block = 10000;

   for (int i = 0; i < count_test_block; ++i) {

-    int16_t input[64], coeff[64];

+    int16_t input[64];

+    tran_low_t coeff[64];

     double output_r[64];

     uint8_t dst[64], src[64];

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -26,8 +26,8 @@

 using libvpx_test::ACMRandom;

 namespace {

-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);

-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);

+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);

+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);

 typedef std::tr1::tuple<FwdTxfmFunc,

                         InvTxfmFunc,

                         InvTxfmFunc,

@@ -74,8 +74,8 @@

       FAIL() << "Wrong Size!";

       break;

-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);

@@ -83,7 +83,7 @@

   const int block_size = size * size;

   DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kMaxNumCoeffs);

   int max_error = 0;

   for (int i = 0; i < count_test_block; ++i) {

@@ -153,8 +153,8 @@

       FAIL() << "Wrong Size!";

       break;

-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);

-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);

+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);

   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);

   const int count_test_block = 1000;

@@ -229,6 +229,7 @@

                    &vp9_idct4x4_16_add_c,

                    &vp9_idct4x4_1_add_c,

                    TX_4X4, 1)));

 #if HAVE_NEON_ASM

 INSTANTIATE_TEST_CASE_P(

     NEON, PartialIDctTest,

@@ -259,7 +260,7 @@

                    TX_4X4, 1)));

 #endif

-#if HAVE_SSE2

+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, PartialIDctTest,

     ::testing::Values(

@@ -293,7 +294,7 @@

                    TX_4X4, 1)));

 #endif

-#if HAVE_SSSE3 && ARCH_X86_64

+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSSE3_64, PartialIDctTest,

     ::testing::Values(

@@ -303,7 +304,7 @@

                    TX_8X8, 12)));

 #endif

-#if HAVE_SSSE3

+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSSE3, PartialIDctTest,

     ::testing::Values(

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -21,6 +21,7 @@

 #include "vp9/common/vp9_common_data.h"

 #include "vp9/common/vp9_enums.h"

 #include "vp9/common/vp9_filter.h"

+#include "vp9/common/vp9_idct.h"

 #include "vp9/common/vp9_mv.h"

 #include "vp9/common/vp9_scale.h"

 #include "vp9/common/vp9_seg_common.h"

@@ -176,7 +177,7 @@

};

 struct macroblockd_plane {

-  int16_t *dqcoeff;

+  tran_low_t *dqcoeff;

   PLANE_TYPE plane_type;

   int subsampling_x;

   int subsampling_y;

@@ -223,11 +224,17 @@

   /* mc buffer */

   DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);

+#if CONFIG_VP9_HIGHBITDEPTH

+  /* Bit depth: 8, 10, 12 */

+  int bd;

+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);

+#endif

   int lossless;

   int corrupted;

-  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);

+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);

   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];

   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -18,14 +18,47 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_idct.h"

-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {

+#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH

+// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs strict

+// overflow wrapping to match expected hardware implementations: a value is

+// shifted up and back down in a 32-bit integer so that only its low 8+bd bits

+// (sign-extended) survive.

+// bd of 8 keeps 16 bits, need to remove 16 bits

+// bd of 10 keeps 18 bits, need to remove 14 bits

+// bd of 12 keeps 20 bits, need to remove 12 bits

+// bd of x keeps 8+x bits, need to remove 24-x bits

+//

+// WRAPLOW is not self-contained: it reads a variable named bd from the scope

+// in which it is expanded.

+//

+// The argument is parenthesized so the int32_t cast applies to the whole

+// expression. Without the parentheses WRAPLOW(dest + trans) would cast only

+// dest, and the addition and shifts would be evaluated at the width of trans,

+// so no wrapping to 8+bd bits would take place.

+#define WRAPLOW(x) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))

+#else

+// Without hardware emulation WRAPLOW is the identity and does not use bd.

+#define WRAPLOW(x) (x)

+#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH

+#if CONFIG_VP9_HIGHBITDEPTH

+// Restricts value to the inclusive range [low, high]. The lower bound is

+// tested first, so low is returned whenever value < low.

+static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,

+                                    tran_low_t high) {

+  if (value < low)

+    return low;

+  if (value > high)

+    return high;

+  return (tran_low_t)value;

+}

+// Adds the residual trans to the pixel value dest and clamps the result to

+// the pixel range selected by bd: [0, 1023] for 10, [0, 4095] for 12 and

+// [0, 255] for 8 or any other value. trans, and then the sum, are passed

+// through WRAPLOW before clamping.

+static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,

+                                            tran_high_t trans, int bd) {

+  tran_low_t max_pixel;

+  trans = WRAPLOW(trans);

+  if (bd == 10)

+    max_pixel = 1023;

+  else if (bd == 12)

+    max_pixel = 4095;

+  else

+    max_pixel = 255;

+  return clamp_high(WRAPLOW(dest + trans), 0, max_pixel);

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,

    0.5 shifts per pixel. */

   int i;

-  int16_t output[16];

-  int a1, b1, c1, d1, e1;

-  const int16_t *ip = input;

-  int16_t *op = output;

+  tran_low_t output[16];

+  tran_high_t a1, b1, c1, d1, e1;

+  const tran_low_t *ip = input;

+  tran_low_t *op = output;

   for (i = 0; i < 4; i++) {

     a1 = ip[0] >> UNIT_QUANT_SHIFT;

@@ -70,12 +103,12 @@

-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {

+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {

   int i;

-  int a1, e1;

-  int16_t tmp[4];

-  const int16_t *ip = in;

-  int16_t *op = tmp;

+  tran_high_t a1, e1;

+  tran_low_t tmp[4];

+  const tran_low_t *ip = in;

+  tran_low_t *op = tmp;

   a1 = ip[0] >> UNIT_QUANT_SHIFT;

   e1 = a1 >> 1;

@@ -96,9 +129,9 @@

-static void idct4(const int16_t *input, int16_t *output) {

-  int16_t step[4];

-  int temp1, temp2;

+static void idct4(const tran_low_t *input, tran_low_t *output) {

+  tran_low_t step[4];

+  tran_high_t temp1, temp2;

   // stage 1

   temp1 = (input[0] + input[2]) * cospi_16_64;

   temp2 = (input[0] - input[2]) * cospi_16_64;

@@ -116,11 +149,11 @@

   output[3] = step[0] - step[3];

-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[4 * 4];

-  int16_t *outptr = out;

+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

+  tran_low_t out[4 * 4];

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[4], temp_out[4];

+  tran_low_t temp_in[4], temp_out[4];

   // Rows

   for (i = 0; i < 4; ++i) {

@@ -140,10 +173,11 @@

-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {

+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,

+                         int dest_stride) {

   int i;

-  int a1;

-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  tran_high_t a1;

+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

   a1 = ROUND_POWER_OF_TWO(out, 4);

@@ -156,9 +190,9 @@

-static void idct8(const int16_t *input, int16_t *output) {

-  int16_t step1[8], step2[8];

-  int temp1, temp2;

+static void idct8(const tran_low_t *input, tran_low_t *output) {

+  tran_low_t step1[8], step2[8];

+  tran_high_t temp1, temp2;

   // stage 1

   step1[0] = input[0];

   step1[2] = input[4];

@@ -201,11 +235,11 @@

   output[7] = step1[0] - step1[7];

-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[8 * 8];

-  int16_t *outptr = out;

+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

+  tran_low_t out[8 * 8];

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[8], temp_out[8];

+  tran_low_t temp_in[8], temp_out[8];

   // First transform rows

   for (i = 0; i < 8; ++i) {

@@ -225,10 +259,10 @@

-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

   int i, j;

-  int a1;

-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  tran_high_t a1;

+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

   a1 = ROUND_POWER_OF_TWO(out, 5);

   for (j = 0; j < 8; ++j) {

@@ -238,13 +272,13 @@

-static void iadst4(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7;

+static void iadst4(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

-  int x0 = input[0];

-  int x1 = input[1];

-  int x2 = input[2];

-  int x3 = input[3];

+  tran_high_t x0 = input[0];

+  tran_high_t x1 = input[1];

+  tran_high_t x2 = input[2];

+  tran_high_t x3 = input[3];

   if (!(x0 | x1 | x2 | x3)) {

     output[0] = output[1] = output[2] = output[3] = 0;

@@ -280,7 +314,7 @@

   output[3] = dct_const_round_shift(s3);

-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,

+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,

                          int tx_type) {

   const transform_2d IHT_4[] = {

     { idct4, idct4  },  // DCT_DCT  = 0

@@ -290,9 +324,9 @@

};

   int i, j;

-  int16_t out[4 * 4];

-  int16_t *outptr = out;

-  int16_t temp_in[4], temp_out[4];

+  tran_low_t out[4 * 4];

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[4], temp_out[4];

   // inverse transform row vectors

   for (i = 0; i < 4; ++i) {

@@ -311,17 +345,17 @@

                                   + dest[j * stride + i]);

-static void iadst8(const int16_t *input, int16_t *output) {

+static void iadst8(const tran_low_t *input, tran_low_t *output) {

   int s0, s1, s2, s3, s4, s5, s6, s7;

-  int x0 = input[7];

-  int x1 = input[0];

-  int x2 = input[5];

-  int x3 = input[2];

-  int x4 = input[3];

-  int x5 = input[4];

-  int x6 = input[1];

-  int x7 = input[6];

+  tran_high_t x0 = input[7];

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[5];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[3];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[1];

+  tran_high_t x7 = input[6];

   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

     output[0] = output[1] = output[2] = output[3] = output[4]

@@ -395,12 +429,12 @@

   { iadst8, iadst8 }   // ADST_ADST = 3

};

-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,

+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,

                          int tx_type) {

   int i, j;

-  int16_t out[8 * 8];

-  int16_t *outptr = out;

-  int16_t temp_in[8], temp_out[8];

+  tran_low_t out[8 * 8];

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[8], temp_out[8];

   const transform_2d ht = IHT_8[tx_type];

   // inverse transform row vectors

@@ -421,11 +455,11 @@

-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[8 * 8] = { 0 };

-  int16_t *outptr = out;

+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

+  tran_low_t out[8 * 8] = { 0 };

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[8], temp_out[8];

+  tran_low_t temp_in[8], temp_out[8];

   // First transform rows

   // only first 4 row has non-zero coefs

@@ -446,9 +480,9 @@

-static void idct16(const int16_t *input, int16_t *output) {

-  int16_t step1[16], step2[16];

-  int temp1, temp2;

+static void idct16(const tran_low_t *input, tran_low_t *output) {

+  tran_low_t step1[16], step2[16];

+  tran_high_t temp1, temp2;

   // stage 1

   step1[0] = input[0/2];

@@ -611,11 +645,12 @@

   output[15] = step2[0] - step2[15];

-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[16 * 16];

-  int16_t *outptr = out;

+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,

+                             int stride) {

+  tran_low_t out[16 * 16];

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[16], temp_out[16];

+  tran_low_t temp_in[16], temp_out[16];

   // First transform rows

   for (i = 0; i < 16; ++i) {

@@ -635,25 +670,26 @@

-static void iadst16(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

+static void iadst16(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

+  tran_high_t s9, s10, s11, s12, s13, s14, s15;

-  int x0 = input[15];

-  int x1 = input[0];

-  int x2 = input[13];

-  int x3 = input[2];

-  int x4 = input[11];

-  int x5 = input[4];

-  int x6 = input[9];

-  int x7 = input[6];

-  int x8 = input[7];

-  int x9 = input[8];

-  int x10 = input[5];

-  int x11 = input[10];

-  int x12 = input[3];

-  int x13 = input[12];

-  int x14 = input[1];

-  int x15 = input[14];

+  tran_high_t x0 = input[15];

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[13];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[11];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[9];

+  tran_high_t x7 = input[6];

+  tran_high_t x8 = input[7];

+  tran_high_t x9 = input[8];

+  tran_high_t x10 = input[5];

+  tran_high_t x11 = input[10];

+  tran_high_t x12 = input[3];

+  tran_high_t x13 = input[12];

+  tran_high_t x14 = input[1];

+  tran_high_t x15 = input[14];

   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8

            | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {

@@ -813,12 +849,12 @@

   { iadst16, iadst16 }   // ADST_ADST = 3

};

-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,

+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,

                             int tx_type) {

   int i, j;

-  int16_t out[16 * 16];

-  int16_t *outptr = out;

-  int16_t temp_in[16], temp_out[16];

+  tran_low_t out[16 * 16];

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[16], temp_out[16];

   const transform_2d ht = IHT_16[tx_type];

   // Rows

@@ -839,11 +875,12 @@

-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[16 * 16] = { 0 };

-  int16_t *outptr = out;

+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,

+                            int stride) {

+  tran_low_t out[16 * 16] = { 0 };

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[16], temp_out[16];

+  tran_low_t temp_in[16], temp_out[16];

   // First transform rows. Since all non-zero dct coefficients are in

   // upper-left 4x4 area, we only need to calculate first 4 rows here.

@@ -864,10 +901,10 @@

-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

   int i, j;

-  int a1;

-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  tran_high_t a1;

+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

   a1 = ROUND_POWER_OF_TWO(out, 6);

   for (j = 0; j < 16; ++j) {

@@ -877,9 +914,9 @@

-static void idct32(const int16_t *input, int16_t *output) {

-  int16_t step1[32], step2[32];

-  int temp1, temp2;

+static void idct32(const tran_low_t *input, tran_low_t *output) {

+  tran_low_t step1[32], step2[32];

+  tran_high_t temp1, temp2;

   // stage 1

   step1[0] = input[0];

@@ -1244,11 +1281,12 @@

   output[31] = step1[0] - step1[31];

-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[32 * 32];

-  int16_t *outptr = out;

+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,

+                              int stride) {

+  tran_low_t out[32 * 32];

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[32], temp_out[32];

+  tran_low_t temp_in[32], temp_out[32];

   // Rows

   for (i = 0; i < 32; ++i) {

@@ -1265,7 +1303,7 @@

     if (zero_coeff[0] | zero_coeff[1])

       idct32(input, outptr);

     else

-      vpx_memset(outptr, 0, sizeof(int16_t) * 32);

+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);

     input += 32;

     outptr += 32;

@@ -1281,11 +1319,12 @@

-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {

-  int16_t out[32 * 32] = {0};

-  int16_t *outptr = out;

+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,

+                            int stride) {

+  tran_low_t out[32 * 32] = {0};

+  tran_low_t *outptr = out;

   int i, j;

-  int16_t temp_in[32], temp_out[32];

+  tran_low_t temp_in[32], temp_out[32];

   // Rows

   // only upper-left 8x8 has non-zero coeff

@@ -1306,11 +1345,11 @@

-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

   int i, j;

-  int a1;

+  tran_high_t a1;

-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

   a1 = ROUND_POWER_OF_TWO(out, 6);

@@ -1322,7 +1361,8 @@

 // idct

-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {

+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob) {

   if (eob > 1)

     vp9_idct4x4_16_add(input, dest, stride);

   else

@@ -1330,7 +1370,8 @@

-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {

+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob) {

   if (eob > 1)

     vp9_iwht4x4_16_add(input, dest, stride);

   else

@@ -1337,7 +1378,8 @@

     vp9_iwht4x4_1_add(input, dest, stride);

-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {

+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob) {

   // If dc is 1, then input[0] is the reconstructed value, do not need

   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

@@ -1354,7 +1396,7 @@

     vp9_idct8x8_64_add(input, dest, stride);

-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,

+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,

                        int eob) {

   /* The calculation can be simplified if there are not many non-zero dct

    * coefficients. Use eobs to separate different cases. */

@@ -1367,7 +1409,7 @@

     vp9_idct16x16_256_add(input, dest, stride);

-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,

+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,

                        int eob) {

   if (eob == 1)

     vp9_idct32x32_1_add(input, dest, stride);

@@ -1379,7 +1421,7 @@

 // iht

-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                     int stride, int eob) {

   if (tx_type == DCT_DCT)

     vp9_idct4x4_add(input, dest, stride, eob);

@@ -1387,7 +1429,7 @@

     vp9_iht4x4_16_add(input, dest, stride, tx_type);

-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                     int stride, int eob) {

   if (tx_type == DCT_DCT) {

     vp9_idct8x8_add(input, dest, stride, eob);

@@ -1396,7 +1438,7 @@

-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                       int stride, int eob) {

   if (tx_type == DCT_DCT) {

     vp9_idct16x16_add(input, dest, stride, eob);

@@ -1404,3 +1446,1433 @@

     vp9_iht16x16_256_add(input, dest, stride, tx_type);

+#if CONFIG_VP9_HIGHBITDEPTH

+// High bit depth inverse 4x4 Walsh-Hadamard transform, added into dest.

+//   input:  16 coefficients, consumed four at a time as rows.

+//   dest8:  destination; converted with CONVERT_TO_SHORTPTR and accessed as

+//           uint16_t samples.

+//   stride: distance between destination rows, in uint16_t samples.

+//   bd:     bit depth, forwarded to clip_pixel_bd_high.

+void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+                               int stride, int bd) {

+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,

+     0.5 shifts per pixel. */

+  int i;

+  tran_low_t output[16];

+  tran_high_t a1, b1, c1, d1, e1;

+  const tran_low_t *ip = input;

+  tran_low_t *op = output;

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

+  // First pass: transform each row of input into output.

+  for (i = 0; i < 4; i++) {

+    // Coefficients are scaled down by UNIT_QUANT_SHIFT on load. Note the

+    // a1, c1, d1, b1 load order.

+    a1 = ip[0] >> UNIT_QUANT_SHIFT;

+    c1 = ip[1] >> UNIT_QUANT_SHIFT;

+    d1 = ip[2] >> UNIT_QUANT_SHIFT;

+    b1 = ip[3] >> UNIT_QUANT_SHIFT;

+    // Each step below uses values updated by the previous ones; the order

+    // must be kept.

+    a1 += c1;

+    d1 -= b1;

+    e1 = (a1 - d1) >> 1;

+    b1 = e1 - b1;

+    c1 = e1 - c1;

+    a1 -= b1;

+    d1 += c1;

+    op[0] = WRAPLOW(a1);

+    op[1] = WRAPLOW(b1);

+    op[2] = WRAPLOW(c1);

+    op[3] = WRAPLOW(d1);

+    ip += 4;

+    op += 4;

+  }

+  // Second pass: transform each column of the intermediate block and add the

+  // result to the matching destination column.

+  ip = output;

+  for (i = 0; i < 4; i++) {

+    a1 = ip[4 * 0];

+    c1 = ip[4 * 1];

+    d1 = ip[4 * 2];

+    b1 = ip[4 * 3];

+    a1 += c1;

+    d1 -= b1;

+    e1 = (a1 - d1) >> 1;

+    b1 = e1 - b1;

+    c1 = e1 - c1;

+    a1 -= b1;

+    d1 += c1;

+    // Add the residual to the existing pixel and clamp to the bd pixel range.

+    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);

+    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);

+    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);

+    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);

+    ip++;

+    dest++;

+  }

+}

+// 4-point one-dimensional inverse transform used by the high bit depth 4x4

+// and 8x8 inverse DCTs. Reads input[0..3] and writes output[0..3]; every

+// stored intermediate is passed through WRAPLOW.

+// All four inputs are consumed into step[] before output is written, so the

+// function may be called with output == input (high_idct8 does this).

+static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {

+  tran_low_t step[4];

+  tran_high_t temp1, temp2;

+  // bd is referenced only through WRAPLOW, and only when

+  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is set; presumably this cast silences

+  // the unused-parameter warning in the other configuration.

+  (void) bd;

+  // stage 1

+  temp1 = (input[0] + input[2]) * cospi_16_64;

+  temp2 = (input[0] - input[2]) * cospi_16_64;

+  step[0] = WRAPLOW(dct_const_round_shift(temp1));

+  step[1] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

+  step[2] = WRAPLOW(dct_const_round_shift(temp1));

+  step[3] = WRAPLOW(dct_const_round_shift(temp2));

+  // stage 2

+  output[0] = WRAPLOW(step[0] + step[3]);

+  output[1] = WRAPLOW(step[1] + step[2]);

+  output[2] = WRAPLOW(step[1] - step[2]);

+  output[3] = WRAPLOW(step[0] - step[3]);

+}

+// High bit depth inverse 4x4 Walsh-Hadamard transform for a block where only

+// the first coefficient is used: in[0] is the only element of in that is read.

+//   dest8:       destination; converted with CONVERT_TO_SHORTPTR and accessed

+//                as uint16_t samples.

+//   dest_stride: distance between destination rows, in uint16_t samples.

+//   bd:          bit depth, forwarded to clip_pixel_bd_high.

+void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,

+                              int dest_stride, int bd) {

+  int i;

+  tran_high_t a1, e1;

+  tran_low_t tmp[4];

+  const tran_low_t *ip = in;

+  tran_low_t *op = tmp;

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

+  (void) bd;

+  // First pass: split the scaled coefficient into e1 (half, rounded down by

+  // the shift) and a1 (the remainder); tmp holds { a1, e1, e1, e1 }.

+  a1 = ip[0] >> UNIT_QUANT_SHIFT;

+  e1 = a1 >> 1;

+  a1 -= e1;

+  op[0] = WRAPLOW(a1);

+  op[1] = op[2] = op[3] = WRAPLOW(e1);

+  // Second pass: for each of the four columns, split tmp[i] the same way; the

+  // top row receives a1 and the remaining three rows receive e1.

+  ip = tmp;

+  for (i = 0; i < 4; i++) {

+    e1 = ip[0] >> 1;

+    a1 = ip[0] - e1;

+    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);

+    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);

+    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);

+    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);

+    ip++;

+    dest++;

+  }

+}

+// High bit depth 4x4 inverse DCT, added into dest. Applies high_idct4 to the

+// four rows of input, then to the four columns of the intermediate block.

+//   input:  16 coefficients, four per row.

+//   dest8:  destination; converted with CONVERT_TO_SHORTPTR and accessed as

+//           uint16_t samples.

+//   stride: distance between destination rows, in uint16_t samples.

+//   bd:     bit depth, forwarded to high_idct4 and clip_pixel_bd_high.

+void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+                               int stride, int bd) {

+  tran_low_t out[4 * 4];

+  tran_low_t *outptr = out;

+  int i, j;

+  tran_low_t temp_in[4], temp_out[4];

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

+  // Rows

+  for (i = 0; i < 4; ++i) {

+    high_idct4(input, outptr, bd);

+    input += 4;

+    outptr += 4;

+  }

+  // Columns

+  for (i = 0; i < 4; ++i) {

+    // Gather column i of the row-transformed block into a contiguous buffer.

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j * 4 + i];

+    high_idct4(temp_in, temp_out, bd);

+    // Round the result down by 4 bits, add it to the existing pixel and clamp.

+    for (j = 0; j < 4; ++j)

+      dest[j * stride + i] = clip_pixel_bd_high(

+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

+  }

+}

+// High bit depth 4x4 inverse DCT for a block where only the first coefficient

+// is used: input[0] is the only element of input that is read. The same

+// residual a1 is added to all 16 destination pixels.

+//   dest8:       destination; converted with CONVERT_TO_SHORTPTR and accessed

+//                as uint16_t samples.

+//   dest_stride: distance between destination rows, in uint16_t samples.

+//   bd:          bit depth, forwarded to clip_pixel_bd_high.

+void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,

+                              int dest_stride, int bd) {

+  int i;

+  tran_high_t a1;

+  // The coefficient is scaled by cospi_16_64 twice, with a rounding shift

+  // after each multiplication.

+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));

+  a1 = ROUND_POWER_OF_TWO(out, 4);

+  // One destination row of four pixels per iteration.

+  for (i = 0; i < 4; i++) {

+    dest[0] = clip_pixel_bd_high(dest[0], a1, bd);

+    dest[1] = clip_pixel_bd_high(dest[1], a1, bd);

+    dest[2] = clip_pixel_bd_high(dest[2], a1, bd);

+    dest[3] = clip_pixel_bd_high(dest[3], a1, bd);

+    dest += dest_stride;

+  }

+}

+// 8-point one-dimensional inverse transform for the high bit depth 8x8

+// inverse DCT. Reads input[0..7] and writes output[0..7]; bd is forwarded to

+// high_idct4 and, when hardware emulation is enabled, used by WRAPLOW.

+static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {

+  tran_low_t step1[8], step2[8];

+  tran_high_t temp1, temp2;

+  // stage 1

+  // step1[0..3] receive the even-indexed inputs in the order

+  // input[0], input[2], input[4], input[6].

+  step1[0] = input[0];

+  step1[2] = input[4];

+  step1[1] = input[2];

+  step1[3] = input[6];

+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  // stage 2 & stage 3 - even half

+  // In-place call: step1[0..3] are replaced by their 4-point transform.

+  high_idct4(step1, step1, bd);

+  // stage 2 - odd half

+  step2[4] = WRAPLOW(step1[4] + step1[5]);

+  step2[5] = WRAPLOW(step1[4] - step1[5]);

+  step2[6] = WRAPLOW(-step1[6] + step1[7]);

+  step2[7] = WRAPLOW(step1[6] + step1[7]);

+  // stage 3 - odd half

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[7] = step2[7];

+  // stage 4

+  // Combine the even half (step1[0..3]) with the odd half (step1[4..7]):

+  // sums fill output[0..3], differences fill output[4..7].

+  output[0] = WRAPLOW(step1[0] + step1[7]);

+  output[1] = WRAPLOW(step1[1] + step1[6]);

+  output[2] = WRAPLOW(step1[2] + step1[5]);

+  output[3] = WRAPLOW(step1[3] + step1[4]);

+  output[4] = WRAPLOW(step1[3] - step1[4]);

+  output[5] = WRAPLOW(step1[2] - step1[5]);

+  output[6] = WRAPLOW(step1[1] - step1[6]);

+  output[7] = WRAPLOW(step1[0] - step1[7]);

+}

+// Full 8x8 high-bitdepth inverse DCT. high_idct8() is applied to all 8 rows

+// of `input` and then to all 8 columns of the intermediate; each result is

+// rounded by 5 bits, added to the 16-bit sample behind `dest8` (row pitch

+// `stride`) and clipped through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

+                               int stride, int bd) {

+  tran_low_t row_pass[8 * 8];

+  tran_low_t col_in[8], col_out[8];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass.

+  for (r = 0; r < 8; ++r)

+    high_idct8(input + 8 * r, row_pass + 8 * r, bd);

+  // Column pass.

+  for (c = 0; c < 8; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 8; ++r)

+      col_in[r] = row_pass[8 * r + c];

+    high_idct8(col_in, col_out, bd);

+    for (r = 0; r < 8; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 5), bd);

+    }

+  }

+}

+// DC-only 8x8 inverse transform: only input[0] is read. It is multiplied by

+// cospi_16_64 and passed through dct_const_round_shift() twice, rounded by

+// 5 bits, and the resulting offset is added to all 64 destination samples,

+// each clipped through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,

+                              int stride, int bd) {

+  uint16_t *row = CONVERT_TO_SHORTPTR(dest8);

+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

+  tran_high_t offset;

+  int x, y;

+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));

+  offset = ROUND_POWER_OF_TWO(dc, 5);

+  for (y = 0; y < 8; ++y, row += stride) {

+    for (x = 0; x < 8; ++x) {

+      row[x] = clip_pixel_bd_high(row[x], offset, bd);

+    }

+  }

+}

+static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {  // 1-D 4-point inverse ADST: reads input[0..3], writes output[0..3].

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

+  tran_high_t x0 = input[0];  // inputs are copied into tran_high_t before any multiply

+  tran_high_t x1 = input[1];

+  tran_high_t x2 = input[2];

+  tran_high_t x3 = input[3];

+  (void) bd;  // bd is not named directly anywhere else in this body

+  if (!(x0 | x1 | x2 | x3)) {  // all four inputs zero: emit zeros and skip the arithmetic

+    vpx_memset(output, 0, 4 * sizeof(*output));

+    return;

+  }

+  s0 = sinpi_1_9 * x0;

+  s1 = sinpi_2_9 * x0;

+  s2 = sinpi_3_9 * x1;

+  s3 = sinpi_4_9 * x2;

+  s4 = sinpi_1_9 * x2;

+  s5 = sinpi_2_9 * x3;

+  s6 = sinpi_4_9 * x3;

+  s7 = x0 - x2 + x3;  // unscaled combination; multiplied by sinpi_3_9 below

+  x0 = s0 + s3 + s5;

+  x1 = s1 - s4 - s6;

+  x2 = sinpi_3_9 * s7;

+  x3 = s2;

+  s0 = x0 + x3;

+  s1 = x1 + x3;

+  s2 = x2;

+  s3 = x0 + x1 - x3;

+  // 1-D transform scaling factor is sqrt(2).

+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

+  // + 1b (addition) = 29b.

+  // Hence the output bit depth is 15b.  NOTE(review): these figures look inherited from the 8-bit path; confirm for bd > 8.

+  output[0] = WRAPLOW(dct_const_round_shift(s0));

+  output[1] = WRAPLOW(dct_const_round_shift(s1));

+  output[2] = WRAPLOW(dct_const_round_shift(s2));

+  output[3] = WRAPLOW(dct_const_round_shift(s3));

+}

+// 4x4 high-bitdepth inverse hybrid transform. `tx_type` indexes IHT_4 (valid

+// values 0..3, not range-checked here) to pick the 1-D kernels: .rows is

+// applied to each row of `input`, .cols to each column of the intermediate.

+// Each result is rounded by 4 bits, added to the 16-bit sample behind `dest8`

+// (row pitch `stride`) and clipped through clip_pixel_bd_high() for `bd`.

+void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+                              int stride, int tx_type, int bd) {

+  // static: the table is constant, so build it once instead of on every call

+  // (same storage class as the HIGH_IHT_8 / HIGH_IHT_16 tables).

+  static const high_transform_2d IHT_4[] = {

+    { high_idct4, high_idct4  },    // DCT_DCT  = 0

+    { high_iadst4, high_idct4 },    // ADST_DCT = 1

+    { high_idct4, high_iadst4 },    // DCT_ADST = 2

+    { high_iadst4, high_iadst4 }    // ADST_ADST = 3

+  };

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

+  int i, j;

+  tran_low_t out[4 * 4];

+  tran_low_t *outptr = out;

+  tran_low_t temp_in[4], temp_out[4];

+  // Inverse transform row vectors.

+  for (i = 0; i < 4; ++i) {

+    IHT_4[tx_type].rows(input, outptr, bd);

+    input  += 4;

+    outptr += 4;

+  }

+  // Inverse transform column vectors.

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j * 4 + i];

+    IHT_4[tx_type].cols(temp_in, temp_out, bd);

+    for (j = 0; j < 4; ++j)

+      dest[j * stride + i] = clip_pixel_bd_high(

+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

+  }

+}

+// 1-D 8-point inverse ADST for the high-bitdepth path: reads input[0..7] and

+// writes output[0..7]. An all-zero input short-circuits to an all-zero

+// output. Every stage result, including the plain sums in stage 2, goes

+// through WRAPLOW, matching how high_iadst16 treats its intermediates.

+static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

+  // Inputs are consumed in the interleaved order 7, 0, 5, 2, 3, 4, 1, 6.

+  tran_high_t x0 = input[7];

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[5];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[3];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[1];

+  tran_high_t x7 = input[6];

+  (void) bd;

+  // Nothing to transform: emit zeros and skip the arithmetic.

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

+    vpx_memset(output, 0, 8 * sizeof(*output));

+    return;

+  }

+  // stage 1

+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;

+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;

+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));

+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));

+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));

+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));

+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));

+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));

+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));

+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

+  // stage 2

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;

+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;

+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;

+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

+  // The plain sums go through WRAPLOW as well (as in high_iadst16 stages 2

+  // and 3), so x2/x3 are wrapped before they feed the stage 3 multiply.

+  x0 = WRAPLOW(s0 + s2);

+  x1 = WRAPLOW(s1 + s3);

+  x2 = WRAPLOW(s0 - s2);

+  x3 = WRAPLOW(s1 - s3);

+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));

+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));

+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));

+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

+  // stage 3

+  s2 = cospi_16_64 * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (x6 - x7);

+  x2 = WRAPLOW(dct_const_round_shift(s2));

+  x3 = WRAPLOW(dct_const_round_shift(s3));

+  x6 = WRAPLOW(dct_const_round_shift(s6));

+  x7 = WRAPLOW(dct_const_round_shift(s7));

+  // Outputs are a permutation of x0..x7 with the odd positions negated.

+  output[0] = WRAPLOW(x0);

+  output[1] = WRAPLOW(-x4);

+  output[2] = WRAPLOW(x6);

+  output[3] = WRAPLOW(-x2);

+  output[4] = WRAPLOW(x3);

+  output[5] = WRAPLOW(-x7);

+  output[6] = WRAPLOW(x5);

+  output[7] = WRAPLOW(-x1);

+}

+static const high_transform_2d HIGH_IHT_8[] = {  // 8-point 1-D kernel pairs, indexed by tx_type in vp9_high_iht8x8_64_add_c()

+  { high_idct8,  high_idct8  },  // DCT_DCT  = 0

+  { high_iadst8, high_idct8  },  // ADST_DCT = 1

+  { high_idct8,  high_iadst8 },  // DCT_ADST = 2

+  { high_iadst8, high_iadst8 }   // ADST_ADST = 3 (last valid index; lookups are not range-checked)

+};

+// 8x8 high-bitdepth inverse hybrid transform. HIGH_IHT_8[tx_type] supplies

+// the 1-D kernels: .rows runs over each row of `input`, .cols over each

+// column of the intermediate. Each result is rounded by 5 bits, added to the

+// 16-bit sample behind `dest8` (row pitch `stride`) and clipped through

+// clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

+                              int stride, int tx_type, int bd) {

+  const high_transform_2d *const kernels = &HIGH_IHT_8[tx_type];

+  tran_low_t row_pass[8 * 8];

+  tran_low_t col_in[8], col_out[8];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass.

+  for (r = 0; r < 8; ++r)

+    kernels->rows(input + 8 * r, row_pass + 8 * r, bd);

+  // Column pass.

+  for (c = 0; c < 8; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 8; ++r)

+      col_in[r] = row_pass[8 * r + c];

+    kernels->cols(col_in, col_out, bd);

+    for (r = 0; r < 8; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 5), bd);

+    }

+  }

+}

+// Reduced 8x8 high-bitdepth inverse DCT: only the first 4 rows of `input`

+// are transformed; rows 4..7 of the intermediate stay zero from its

+// initializer. All 8 columns are then transformed, rounded by 5 bits, added

+// to the 16-bit samples behind `dest8` (row pitch `stride`) and clipped

+// through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,

+                               int stride, int bd) {

+  tran_low_t row_pass[8 * 8] = { 0 };

+  tran_low_t col_in[8], col_out[8];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass: only the first 4 rows hold non-zero coefficients.

+  for (r = 0; r < 4; ++r)

+    high_idct8(input + 8 * r, row_pass + 8 * r, bd);

+  // Column pass over every column.

+  for (c = 0; c < 8; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 8; ++r)

+      col_in[r] = row_pass[8 * r + c];

+    high_idct8(col_in, col_out, bd);

+    for (r = 0; r < 8; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 5), bd);

+    }

+  }

+}

+static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {  // 1-D 16-point inverse DCT: reads input[0..15], writes output[0..15].

+  tran_low_t step1[16], step2[16];  // stage buffers: each stage reads one and fills the other

+  tran_high_t temp1, temp2;  // products held in tran_high_t until dct_const_round_shift()

+  (void) bd;  // bd is not named directly anywhere else in this body

+  // stage 1: input reordering only; the n/2 indices evaluate to 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15

+  step1[0] = input[0/2];

+  step1[1] = input[16/2];

+  step1[2] = input[8/2];

+  step1[3] = input[24/2];

+  step1[4] = input[4/2];

+  step1[5] = input[20/2];

+  step1[6] = input[12/2];

+  step1[7] = input[28/2];

+  step1[8] = input[2/2];

+  step1[9] = input[18/2];

+  step1[10] = input[10/2];

+  step1[11] = input[26/2];

+  step1[12] = input[6/2];

+  step1[13] = input[22/2];

+  step1[14] = input[14/2];

+  step1[15] = input[30/2];

+  // stage 2: elements 0..7 pass through; pairs (8,15), (9,14), (10,13), (11,12) are rotated

+  step2[0] = step1[0];

+  step2[1] = step1[1];

+  step2[2] = step1[2];

+  step2[3] = step1[3];

+  step2[4] = step1[4];

+  step2[5] = step1[5];

+  step2[6] = step1[6];

+  step2[7] = step1[7];

+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;

+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;

+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;

+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;

+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;

+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;

+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;

+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

+  // stage 3: elements 0..3 pass through; pairs (4,7), (5,6) are rotated; 8..15 are add/subtract butterflies

+  step1[0] = step2[0];

+  step1[1] = step2[1];

+  step1[2] = step2[2];

+  step1[3] = step2[3];

+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;

+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;

+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;

+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[8] = WRAPLOW(step2[8] + step2[9]);

+  step1[9] = WRAPLOW(step2[8] - step2[9]);

+  step1[10] = WRAPLOW(-step2[10] + step2[11]);

+  step1[11] = WRAPLOW(step2[10] + step2[11]);

+  step1[12] = WRAPLOW(step2[12] + step2[13]);

+  step1[13] = WRAPLOW(step2[12] - step2[13]);

+  step1[14] = WRAPLOW(-step2[14] + step2[15]);

+  step1[15] = WRAPLOW(step2[14] + step2[15]);

+  // stage 4: elements 8, 11, 12 and 15 pass through; the rest are rotated or combined

+  temp1 = (step1[0] + step1[1]) * cospi_16_64;

+  temp2 = (step1[0] - step1[1]) * cospi_16_64;

+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;

+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;

+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[4] = WRAPLOW(step1[4] + step1[5]);

+  step2[5] = WRAPLOW(step1[4] - step1[5]);

+  step2[6] = WRAPLOW(-step1[6] + step1[7]);

+  step2[7] = WRAPLOW(step1[6] + step1[7]);

+  step2[8] = step1[8];

+  step2[15] = step1[15];

+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;

+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;

+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;

+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[11] = step1[11];

+  step2[12] = step1[12];

+  // stage 5: elements 4 and 7 pass through; pair (5,6) is scaled by cospi_16_64; the rest are add/subtract butterflies

+  step1[0] = WRAPLOW(step2[0] + step2[3]);

+  step1[1] = WRAPLOW(step2[1] + step2[2]);

+  step1[2] = WRAPLOW(step2[1] - step2[2]);

+  step1[3] = WRAPLOW(step2[0] - step2[3]);

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[7] = step2[7];

+  step1[8] = WRAPLOW(step2[8] + step2[11]);

+  step1[9] = WRAPLOW(step2[9] + step2[10]);

+  step1[10] = WRAPLOW(step2[9] - step2[10]);

+  step1[11] = WRAPLOW(step2[8] - step2[11]);

+  step1[12] = WRAPLOW(-step2[12] + step2[15]);

+  step1[13] = WRAPLOW(-step2[13] + step2[14]);

+  step1[14] = WRAPLOW(step2[13] + step2[14]);

+  step1[15] = WRAPLOW(step2[12] + step2[15]);

+  // stage 6: elements 8, 9, 14 and 15 pass through; pairs (10,13), (11,12) are scaled by cospi_16_64

+  step2[0] = WRAPLOW(step1[0] + step1[7]);

+  step2[1] = WRAPLOW(step1[1] + step1[6]);

+  step2[2] = WRAPLOW(step1[2] + step1[5]);

+  step2[3] = WRAPLOW(step1[3] + step1[4]);

+  step2[4] = WRAPLOW(step1[3] - step1[4]);

+  step2[5] = WRAPLOW(step1[2] - step1[5]);

+  step2[6] = WRAPLOW(step1[1] - step1[6]);

+  step2[7] = WRAPLOW(step1[0] - step1[7]);

+  step2[8] = step1[8];

+  step2[9] = step1[9];

+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;

+  temp2 = (step1[10] + step1[13]) * cospi_16_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;

+  temp2 = (step1[11] + step1[12]) * cospi_16_64;

+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[14] = step1[14];

+  step2[15] = step1[15];

+  // stage 7: output[k] = step2[k] + step2[15-k] and output[15-k] = step2[k] - step2[15-k], for k = 0..7

+  output[0] = WRAPLOW(step2[0] + step2[15]);

+  output[1] = WRAPLOW(step2[1] + step2[14]);

+  output[2] = WRAPLOW(step2[2] + step2[13]);

+  output[3] = WRAPLOW(step2[3] + step2[12]);

+  output[4] = WRAPLOW(step2[4] + step2[11]);

+  output[5] = WRAPLOW(step2[5] + step2[10]);

+  output[6] = WRAPLOW(step2[6] + step2[9]);

+  output[7] = WRAPLOW(step2[7] + step2[8]);

+  output[8] = WRAPLOW(step2[7] - step2[8]);

+  output[9] = WRAPLOW(step2[6] - step2[9]);

+  output[10] = WRAPLOW(step2[5] - step2[10]);

+  output[11] = WRAPLOW(step2[4] - step2[11]);

+  output[12] = WRAPLOW(step2[3] - step2[12]);

+  output[13] = WRAPLOW(step2[2] - step2[13]);

+  output[14] = WRAPLOW(step2[1] - step2[14]);

+  output[15] = WRAPLOW(step2[0] - step2[15]);

+}

+// Full 16x16 high-bitdepth inverse DCT. high_idct16() is applied to all 16

+// rows of `input` and then to all 16 columns of the intermediate; each

+// result is rounded by 6 bits, added to the 16-bit sample behind `dest8`

+// (row pitch `stride`) and clipped through clip_pixel_bd_high() for `bd`.

+void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

+                                  int stride, int bd) {

+  tran_low_t row_pass[16 * 16];

+  tran_low_t col_in[16], col_out[16];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass.

+  for (r = 0; r < 16; ++r)

+    high_idct16(input + 16 * r, row_pass + 16 * r, bd);

+  // Column pass.

+  for (c = 0; c < 16; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 16; ++r)

+      col_in[r] = row_pass[16 * r + c];

+    high_idct16(col_in, col_out, bd);

+    for (r = 0; r < 16; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6), bd);

+    }

+  }

+}

+static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {  // 1-D 16-point inverse ADST: reads input[0..15], writes output[0..15].

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

+  tran_high_t s9, s10, s11, s12, s13, s14, s15;

+  tran_high_t x0 = input[15];  // inputs are loaded in the interleaved order 15, 0, 13, 2, 11, 4, 9, 6, 7, 8, 5, 10, 3, 12, 1, 14

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[13];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[11];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[9];

+  tran_high_t x7 = input[6];

+  tran_high_t x8 = input[7];

+  tran_high_t x9 = input[8];

+  tran_high_t x10 = input[5];

+  tran_high_t x11 = input[10];

+  tran_high_t x12 = input[3];

+  tran_high_t x13 = input[12];

+  tran_high_t x14 = input[1];

+  tran_high_t x15 = input[14];

+  (void) bd;  // bd is not named directly anywhere else in this body

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8

+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {  // all sixteen inputs zero: emit zeros and skip the arithmetic

+    vpx_memset(output, 0, 16 * sizeof(*output));

+    return;

+  }

+  // stage 1: rotate the pairs (x0,x1) .. (x14,x15), then combine s[k] with s[k+8]

+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;

+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;

+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;

+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));

+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));

+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));

+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));

+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));

+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));

+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));

+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));

+  x8  = WRAPLOW(dct_const_round_shift(s0 - s8));

+  x9  = WRAPLOW(dct_const_round_shift(s1 - s9));

+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));

+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));

+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));

+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));

+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

+  // stage 2: x0..x7 are combined by plain add/subtract; x8..x15 are rotated first

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4;

+  s5 = x5;

+  s6 = x6;

+  s7 = x7;

+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;

+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;

+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;

+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;

+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;

+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;

+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;

+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

+  x0 = WRAPLOW(s0 + s4);  // unscaled sums: WRAPLOW only, no dct_const_round_shift()

+  x1 = WRAPLOW(s1 + s5);

+  x2 = WRAPLOW(s2 + s6);

+  x3 = WRAPLOW(s3 + s7);

+  x4 = WRAPLOW(s0 - s4);

+  x5 = WRAPLOW(s1 - s5);

+  x6 = WRAPLOW(s2 - s6);

+  x7 = WRAPLOW(s3 - s7);

+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));

+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));

+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));

+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));

+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));

+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));

+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

+  // stage 3: x4..x7 and x12..x15 are rotated by cospi_8_64 / cospi_24_64; the others are plain add/subtract

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;

+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;

+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;

+  s8 = x8;

+  s9 = x9;

+  s10 = x10;

+  s11 = x11;

+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;

+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;

+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

+  x0 = WRAPLOW(s0 + s2);

+  x1 = WRAPLOW(s1 + s3);

+  x2 = WRAPLOW(s0 - s2);

+  x3 = WRAPLOW(s1 - s3);

+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));

+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));

+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));

+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

+  x8 = WRAPLOW(s8 + s10);

+  x9 = WRAPLOW(s9 + s11);

+  x10 = WRAPLOW(s8 - s10);

+  x11 = WRAPLOW(s9 - s11);

+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));

+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));

+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));

+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

+  // stage 4: only pairs (2,3), (6,7), (10,11), (14,15) change, each scaled by +/- cospi_16_64

+  s2 = (- cospi_16_64) * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (-x6 + x7);

+  s10 = cospi_16_64 * (x10 + x11);

+  s11 = cospi_16_64 * (-x10 + x11);

+  s14 = (- cospi_16_64) * (x14 + x15);

+  s15 = cospi_16_64 * (x14 - x15);

+  x2 = WRAPLOW(dct_const_round_shift(s2));

+  x3 = WRAPLOW(dct_const_round_shift(s3));

+  x6 = WRAPLOW(dct_const_round_shift(s6));

+  x7 = WRAPLOW(dct_const_round_shift(s7));

+  x10 = WRAPLOW(dct_const_round_shift(s10));

+  x11 = WRAPLOW(dct_const_round_shift(s11));

+  x14 = WRAPLOW(dct_const_round_shift(s14));

+  x15 = WRAPLOW(dct_const_round_shift(s15));

+  output[0] = WRAPLOW(x0);  // outputs are a permutation of x0..x15; x8, x4, x13 and x1 are negated

+  output[1] = WRAPLOW(-x8);

+  output[2] = WRAPLOW(x12);

+  output[3] = WRAPLOW(-x4);

+  output[4] = WRAPLOW(x6);

+  output[5] = WRAPLOW(x14);

+  output[6] = WRAPLOW(x10);

+  output[7] = WRAPLOW(x2);

+  output[8] = WRAPLOW(x3);

+  output[9] = WRAPLOW(x11);

+  output[10] = WRAPLOW(x15);

+  output[11] = WRAPLOW(x7);

+  output[12] = WRAPLOW(x5);

+  output[13] = WRAPLOW(-x13);

+  output[14] = WRAPLOW(x9);

+  output[15] = WRAPLOW(-x1);

+}

+static const high_transform_2d HIGH_IHT_16[] = {  // 16-point 1-D kernel pairs, indexed by tx_type in vp9_high_iht16x16_256_add_c()

+  { high_idct16,  high_idct16  },  // DCT_DCT  = 0

+  { high_iadst16, high_idct16  },  // ADST_DCT = 1

+  { high_idct16,  high_iadst16 },  // DCT_ADST = 2

+  { high_iadst16, high_iadst16 }   // ADST_ADST = 3 (last valid index; lookups are not range-checked)

+};

+// 16x16 high-bitdepth inverse hybrid transform. HIGH_IHT_16[tx_type]

+// supplies the 1-D kernels: .rows runs over each row of `input`, .cols over

+// each column of the intermediate. Each result is rounded by 6 bits, added

+// to the 16-bit sample behind `dest8` (row pitch `stride`) and clipped

+// through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

+                                 int stride, int tx_type, int bd) {

+  const high_transform_2d *const kernels = &HIGH_IHT_16[tx_type];

+  tran_low_t row_pass[16 * 16];

+  tran_low_t col_in[16], col_out[16];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass.

+  for (r = 0; r < 16; ++r)

+    kernels->rows(input + 16 * r, row_pass + 16 * r, bd);

+  // Column pass.

+  for (c = 0; c < 16; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 16; ++r)

+      col_in[r] = row_pass[16 * r + c];

+    kernels->cols(col_in, col_out, bd);

+    for (r = 0; r < 16; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6), bd);

+    }

+  }

+}

+// Reduced 16x16 high-bitdepth inverse DCT: only the first 4 rows of `input`

+// are transformed; rows 4..15 of the intermediate stay zero from its

+// initializer. All 16 columns are then transformed, rounded by 6 bits, added

+// to the 16-bit samples behind `dest8` (row pitch `stride`) and clipped

+// through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,

+                                 int stride, int bd) {

+  tran_low_t row_pass[16 * 16] = { 0 };

+  tran_low_t col_in[16], col_out[16];

+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);

+  int r, c;

+  // Row pass: all non-zero coefficients sit in the upper-left 4x4 area, so

+  // only the first 4 rows need to be computed.

+  for (r = 0; r < 4; ++r)

+    high_idct16(input + 16 * r, row_pass + 16 * r, bd);

+  // Column pass over every column.

+  for (c = 0; c < 16; ++c) {

+    uint16_t *const column = pixels + c;

+    for (r = 0; r < 16; ++r)

+      col_in[r] = row_pass[16 * r + c];

+    high_idct16(col_in, col_out, bd);

+    for (r = 0; r < 16; ++r) {

+      column[r * stride] = clip_pixel_bd_high(

+          column[r * stride], ROUND_POWER_OF_TWO(col_out[r], 6), bd);

+    }

+  }

+}

+// DC-only 16x16 inverse transform: only input[0] is read. It is multiplied

+// by cospi_16_64 and passed through dct_const_round_shift() twice, rounded

+// by 6 bits, and the resulting offset is added to all 256 destination

+// samples, each clipped through clip_pixel_bd_high() for bit depth `bd`.

+void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,

+                                int stride, int bd) {

+  uint16_t *row = CONVERT_TO_SHORTPTR(dest8);

+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

+  tran_high_t offset;

+  int x, y;

+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));

+  offset = ROUND_POWER_OF_TWO(dc, 6);

+  for (y = 0; y < 16; ++y, row += stride) {

+    for (x = 0; x < 16; ++x) {

+      row[x] = clip_pixel_bd_high(row[x], offset, bd);

+    }

+  }

+}

+static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {

+  tran_low_t step1[32], step2[32];

+  tran_high_t temp1, temp2;

+  (void) bd;

+  // stage 1

+  step1[0] = input[0];

+  step1[1] = input[16];

+  step1[2] = input[8];

+  step1[3] = input[24];

+  step1[4] = input[4];

+  step1[5] = input[20];

+  step1[6] = input[12];

+  step1[7] = input[28];

+  step1[8] = input[2];

+  step1[9] = input[18];

+  step1[10] = input[10];

+  step1[11] = input[26];

+  step1[12] = input[6];

+  step1[13] = input[22];

+  step1[14] = input[14];

+  step1[15] = input[30];

+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;

+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;

+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;

+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;

+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;

+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;

+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;

+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;

+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;

+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;

+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;

+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;

+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;

+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;

+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;

+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;

+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

+  // stage 2

+  step2[0] = step1[0];

+  step2[1] = step1[1];

+  step2[2] = step1[2];

+  step2[3] = step1[3];

+  step2[4] = step1[4];

+  step2[5] = step1[5];

+  step2[6] = step1[6];

+  step2[7] = step1[7];

+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;

+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;

+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;

+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;

+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;

+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;

+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;

+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[16] = WRAPLOW(step1[16] + step1[17]);

+  step2[17] = WRAPLOW(step1[16] - step1[17]);

+  step2[18] = WRAPLOW(-step1[18] + step1[19]);

+  step2[19] = WRAPLOW(step1[18] + step1[19]);

+  step2[20] = WRAPLOW(step1[20] + step1[21]);

+  step2[21] = WRAPLOW(step1[20] - step1[21]);

+  step2[22] = WRAPLOW(-step1[22] + step1[23]);

+  step2[23] = WRAPLOW(step1[22] + step1[23]);

+  step2[24] = WRAPLOW(step1[24] + step1[25]);

+  step2[25] = WRAPLOW(step1[24] - step1[25]);

+  step2[26] = WRAPLOW(-step1[26] + step1[27]);

+  step2[27] = WRAPLOW(step1[26] + step1[27]);

+  step2[28] = WRAPLOW(step1[28] + step1[29]);

+  step2[29] = WRAPLOW(step1[28] - step1[29]);

+  step2[30] = WRAPLOW(-step1[30] + step1[31]);

+  step2[31] = WRAPLOW(step1[30] + step1[31]);

+  // stage 3

+  step1[0] = step2[0];

+  step1[1] = step2[1];

+  step1[2] = step2[2];

+  step1[3] = step2[3];

+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;

+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;

+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;

+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[8] = WRAPLOW(step2[8] + step2[9]);

+  step1[9] = WRAPLOW(step2[8] - step2[9]);

+  step1[10] = WRAPLOW(-step2[10] + step2[11]);

+  step1[11] = WRAPLOW(step2[10] + step2[11]);

+  step1[12] = WRAPLOW(step2[12] + step2[13]);

+  step1[13] = WRAPLOW(step2[12] - step2[13]);

+  step1[14] = WRAPLOW(-step2[14] + step2[15]);

+  step1[15] = WRAPLOW(step2[14] + step2[15]);

+  step1[16] = step2[16];

+  step1[31] = step2[31];

+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;

+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;

+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;

+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;

+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[19] = step2[19];

+  step1[20] = step2[20];

+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;

+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;

+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;

+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;

+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[23] = step2[23];

+  step1[24] = step2[24];

+  step1[27] = step2[27];

+  step1[28] = step2[28];

+  // stage 4

+  temp1 = (step1[0] + step1[1]) * cospi_16_64;

+  temp2 = (step1[0] - step1[1]) * cospi_16_64;

+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;

+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;

+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[4] = WRAPLOW(step1[4] + step1[5]);

+  step2[5] = WRAPLOW(step1[4] - step1[5]);

+  step2[6] = WRAPLOW(-step1[6] + step1[7]);

+  step2[7] = WRAPLOW(step1[6] + step1[7]);

+  step2[8] = step1[8];

+  step2[15] = step1[15];

+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;

+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;

+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;

+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[11] = step1[11];

+  step2[12] = step1[12];

+  step2[16] = WRAPLOW(step1[16] + step1[19]);

+  step2[17] = WRAPLOW(step1[17] + step1[18]);

+  step2[18] = WRAPLOW(step1[17] - step1[18]);

+  step2[19] = WRAPLOW(step1[16] - step1[19]);

+  step2[20] = WRAPLOW(-step1[20] + step1[23]);

+  step2[21] = WRAPLOW(-step1[21] + step1[22]);

+  step2[22] = WRAPLOW(step1[21] + step1[22]);

+  step2[23] = WRAPLOW(step1[20] + step1[23]);

+  step2[24] = WRAPLOW(step1[24] + step1[27]);

+  step2[25] = WRAPLOW(step1[25] + step1[26]);

+  step2[26] = WRAPLOW(step1[25] - step1[26]);

+  step2[27] = WRAPLOW(step1[24] - step1[27]);

+  step2[28] = WRAPLOW(-step1[28] + step1[31]);

+  step2[29] = WRAPLOW(-step1[29] + step1[30]);

+  step2[30] = WRAPLOW(step1[29] + step1[30]);

+  step2[31] = WRAPLOW(step1[28] + step1[31]);

+  // stage 5

+  step1[0] = WRAPLOW(step2[0] + step2[3]);

+  step1[1] = WRAPLOW(step2[1] + step2[2]);

+  step1[2] = WRAPLOW(step2[1] - step2[2]);

+  step1[3] = WRAPLOW(step2[0] - step2[3]);

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[7] = step2[7];

+  step1[8] = WRAPLOW(step2[8] + step2[11]);

+  step1[9] = WRAPLOW(step2[9] + step2[10]);

+  step1[10] = WRAPLOW(step2[9] - step2[10]);

+  step1[11] = WRAPLOW(step2[8] - step2[11]);

+  step1[12] = WRAPLOW(-step2[12] + step2[15]);

+  step1[13] = WRAPLOW(-step2[13] + step2[14]);

+  step1[14] = WRAPLOW(step2[13] + step2[14]);

+  step1[15] = WRAPLOW(step2[12] + step2[15]);

+  step1[16] = step2[16];

+  step1[17] = step2[17];

+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;

+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;

+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;

+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;

+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;

+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;

+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;

+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;

+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[22] = step2[22];

+  step1[23] = step2[23];

+  step1[24] = step2[24];

+  step1[25] = step2[25];

+  step1[30] = step2[30];

+  step1[31] = step2[31];

+  // stage 6

+  step2[0] = WRAPLOW(step1[0] + step1[7]);

+  step2[1] = WRAPLOW(step1[1] + step1[6]);

+  step2[2] = WRAPLOW(step1[2] + step1[5]);

+  step2[3] = WRAPLOW(step1[3] + step1[4]);

+  step2[4] = WRAPLOW(step1[3] - step1[4]);

+  step2[5] = WRAPLOW(step1[2] - step1[5]);

+  step2[6] = WRAPLOW(step1[1] - step1[6]);

+  step2[7] = WRAPLOW(step1[0] - step1[7]);

+  step2[8] = step1[8];

+  step2[9] = step1[9];

+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;

+  temp2 = (step1[10] + step1[13]) * cospi_16_64;

+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;

+  temp2 = (step1[11] + step1[12]) * cospi_16_64;

+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));

+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

+  step2[14] = WRAPLOW(step1[14]);

+  step2[15] = WRAPLOW(step1[15]);

+  step2[16] = WRAPLOW(step1[16] + step1[23]);

+  step2[17] = WRAPLOW(step1[17] + step1[22]);

+  step2[18] = WRAPLOW(step1[18] + step1[21]);

+  step2[19] = WRAPLOW(step1[19] + step1[20]);

+  step2[20] = WRAPLOW(step1[19] - step1[20]);

+  step2[21] = WRAPLOW(step1[18] - step1[21]);

+  step2[22] = WRAPLOW(step1[17] - step1[22]);

+  step2[23] = WRAPLOW(step1[16] - step1[23]);

+  step2[24] = WRAPLOW(-step1[24] + step1[31]);

+  step2[25] = WRAPLOW(-step1[25] + step1[30]);

+  step2[26] = WRAPLOW(-step1[26] + step1[29]);

+  step2[27] = WRAPLOW(-step1[27] + step1[28]);

+  step2[28] = WRAPLOW(step1[27] + step1[28]);

+  step2[29] = WRAPLOW(step1[26] + step1[29]);

+  step2[30] = WRAPLOW(step1[25] + step1[30]);

+  step2[31] = WRAPLOW(step1[24] + step1[31]);

+  // stage 7

+  step1[0] = WRAPLOW(step2[0] + step2[15]);

+  step1[1] = WRAPLOW(step2[1] + step2[14]);

+  step1[2] = WRAPLOW(step2[2] + step2[13]);

+  step1[3] = WRAPLOW(step2[3] + step2[12]);

+  step1[4] = WRAPLOW(step2[4] + step2[11]);

+  step1[5] = WRAPLOW(step2[5] + step2[10]);

+  step1[6] = WRAPLOW(step2[6] + step2[9]);

+  step1[7] = WRAPLOW(step2[7] + step2[8]);

+  step1[8] = WRAPLOW(step2[7] - step2[8]);

+  step1[9] = WRAPLOW(step2[6] - step2[9]);

+  step1[10] = WRAPLOW(step2[5] - step2[10]);

+  step1[11] = WRAPLOW(step2[4] - step2[11]);

+  step1[12] = WRAPLOW(step2[3] - step2[12]);

+  step1[13] = WRAPLOW(step2[2] - step2[13]);

+  step1[14] = WRAPLOW(step2[1] - step2[14]);

+  step1[15] = WRAPLOW(step2[0] - step2[15]);

+  step1[16] = step2[16];

+  step1[17] = step2[17];

+  step1[18] = step2[18];

+  step1[19] = step2[19];

+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;

+  temp2 = (step2[20] + step2[27]) * cospi_16_64;

+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;

+  temp2 = (step2[21] + step2[26]) * cospi_16_64;

+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;

+  temp2 = (step2[22] + step2[25]) * cospi_16_64;

+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;

+  temp2 = (step2[23] + step2[24]) * cospi_16_64;

+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));

+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

+  step1[28] = step2[28];

+  step1[29] = step2[29];

+  step1[30] = step2[30];

+  step1[31] = step2[31];

+  // final stage

+  output[0] = WRAPLOW(step1[0] + step1[31]);

+  output[1] = WRAPLOW(step1[1] + step1[30]);

+  output[2] = WRAPLOW(step1[2] + step1[29]);

+  output[3] = WRAPLOW(step1[3] + step1[28]);

+  output[4] = WRAPLOW(step1[4] + step1[27]);

+  output[5] = WRAPLOW(step1[5] + step1[26]);

+  output[6] = WRAPLOW(step1[6] + step1[25]);

+  output[7] = WRAPLOW(step1[7] + step1[24]);

+  output[8] = WRAPLOW(step1[8] + step1[23]);

+  output[9] = WRAPLOW(step1[9] + step1[22]);

+  output[10] = WRAPLOW(step1[10] + step1[21]);

+  output[11] = WRAPLOW(step1[11] + step1[20]);

+  output[12] = WRAPLOW(step1[12] + step1[19]);

+  output[13] = WRAPLOW(step1[13] + step1[18]);

+  output[14] = WRAPLOW(step1[14] + step1[17]);

+  output[15] = WRAPLOW(step1[15] + step1[16]);

+  output[16] = WRAPLOW(step1[15] - step1[16]);

+  output[17] = WRAPLOW(step1[14] - step1[17]);

+  output[18] = WRAPLOW(step1[13] - step1[18]);

+  output[19] = WRAPLOW(step1[12] - step1[19]);

+  output[20] = WRAPLOW(step1[11] - step1[20]);

+  output[21] = WRAPLOW(step1[10] - step1[21]);

+  output[22] = WRAPLOW(step1[9] - step1[22]);

+  output[23] = WRAPLOW(step1[8] - step1[23]);

+  output[24] = WRAPLOW(step1[7] - step1[24]);

+  output[25] = WRAPLOW(step1[6] - step1[25]);

+  output[26] = WRAPLOW(step1[5] - step1[26]);

+  output[27] = WRAPLOW(step1[4] - step1[27]);

+  output[28] = WRAPLOW(step1[3] - step1[28]);

+  output[29] = WRAPLOW(step1[2] - step1[29]);

+  output[30] = WRAPLOW(step1[1] - step1[30]);

+  output[31] = WRAPLOW(step1[0] - step1[31]);

+}

+void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,

+                                   int stride, int bd) {  // Rows, then columns.

+  tran_low_t out[32 * 32];  // Row-pass output, 32 values per row.

+  tran_low_t *outptr = out;

+  int i, j;

+  tran_low_t temp_in[32], temp_out[32];  // One column in / out.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is used as uint16_t.

+  // Rows: run high_idct32 on each of the 32 input rows, writing into out.

+  for (i = 0; i < 32; ++i) {

+    tran_low_t zero_coeff[16];  // Scratch for OR-folding the 32 inputs.

+    for (j = 0; j < 16; ++j)  // Fold 32 -> 16 by OR-ing adjacent pairs.

+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];

+    for (j = 0; j < 8; ++j)  // Fold 16 -> 8.

+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

+    for (j = 0; j < 4; ++j)  // Fold 8 -> 4.

+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

+    for (j = 0; j < 2; ++j)  // Fold 4 -> 2.

+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

+    if (zero_coeff[0] | zero_coeff[1])  // Non-zero iff some input is non-zero.

+      high_idct32(input, outptr, bd);

+    else  // All 32 inputs are zero: store zeros and skip the transform.

+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);

+    input += 32;

+    outptr += 32;

+  }

+  // Columns: run high_idct32 on each column of out and update dest.

+  for (i = 0; i < 32; ++i) {

+    for (j = 0; j < 32; ++j)  // Gather column i of the row-pass output.

+      temp_in[j] = out[j * 32 + i];

+    high_idct32(temp_in, temp_out, bd);

+    for (j = 0; j < 32; ++j)  // Drop 6 bits, then merge into dest (add+clip?).

+      dest[j * stride + i] = clip_pixel_bd_high(

+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+  }

+}

+void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,

+                                 int stride, int bd) {  // First 8 rows only.

+  tran_low_t out[32 * 32] = {0};  // Zeroed: rows 8..31 are never written.

+  tran_low_t *outptr = out;

+  int i, j;

+  tran_low_t temp_in[32], temp_out[32];  // One column in / out.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is used as uint16_t.

+  // Rows

+  // Only upper-left 8x8 has non-zero coeff.

+  for (i = 0; i < 8; ++i) {  // Remaining rows keep the zeros from the init.

+    high_idct32(input, outptr, bd);

+    input += 32;

+    outptr += 32;

+  }

+  // Columns: run high_idct32 on each column of out and update dest.

+  for (i = 0; i < 32; ++i) {

+    for (j = 0; j < 32; ++j)  // Gather column i of the row-pass output.

+      temp_in[j] = out[j * 32 + i];

+    high_idct32(temp_in, temp_out, bd);

+    for (j = 0; j < 32; ++j)  // Drop 6 bits, then merge into dest (add+clip?).

+      dest[j * stride + i] = clip_pixel_bd_high(

+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+  }

+}

+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,

+                                int stride, int bd) {  // Reads input[0] only.

+  int i, j;

+  int a1;  // Single value applied to every pixel of the 32x32 block.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // dest8 is used as uint16_t.

+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));  // Scaled twice.

+  a1 = ROUND_POWER_OF_TWO(out, 6);  // Round off 6 bits.

+  for (j = 0; j < 32; ++j) {  // 32 rows; dest advances by stride per row.

+    for (i = 0; i < 32; ++i)  // 32 pixels per row.

+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);

+    dest += stride;

+  }

+}

+// idct

+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd) {  // Picks a 4x4 IDCT by eob.

+  if (eob > 1)  // eob above 1 selects the _16 variant.

+    vp9_high_idct4x4_16_add(input, dest, stride, bd);

+  else  // eob <= 1 selects the _1 variant.

+    vp9_high_idct4x4_1_add(input, dest, stride, bd);

+}

+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd) {  // Picks a 4x4 IWHT by eob.

+  if (eob > 1)  // eob above 1 selects the _16 variant.

+    vp9_high_iwht4x4_16_add(input, dest, stride, bd);

+  else  // eob <= 1 selects the _1 variant.

+    vp9_high_iwht4x4_1_add(input, dest, stride, bd);

+}

+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd) {  // Picks an 8x8 IDCT by eob.

+  // If dc is 1, then input[0] is the reconstructed value, do not need

+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

+  // The calculation can be simplified if there are not many non-zero dct

+  // coefficients. Use eobs to decide what to do.

+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.

+  // Combine that with code here.

+  // DC only DCT coefficient

+  if (eob == 1) {

+    vp9_high_idct8x8_1_add(input, dest, stride, bd);

+  } else if (eob <= 10) {  // Any other eob up to 10: _10 variant.

+    vp9_high_idct8x8_10_add(input, dest, stride, bd);

+  } else {  // eob above 10: _64 variant.

+    vp9_high_idct8x8_64_add(input, dest, stride, bd);

+  }

+}

+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,

+                       int eob, int bd) {  // Picks a 16x16 IDCT by eob.

+  // The calculation can be simplified if there are not many non-zero dct

+  // coefficients. Use eobs to separate different cases.

+  // DC only DCT coefficient.

+  if (eob == 1) {

+    vp9_high_idct16x16_1_add(input, dest, stride, bd);

+  } else if (eob <= 10) {  // Any other eob up to 10: _10 variant.

+    vp9_high_idct16x16_10_add(input, dest, stride, bd);

+  } else {  // eob above 10: _256 variant.

+    vp9_high_idct16x16_256_add(input, dest, stride, bd);

+  }

+}

+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,

+                       int eob, int bd) {  // Picks a 32x32 IDCT by eob.

+  // eob == 1: _1 variant. eob <= 34: non-zero coeff only in upper-left 8x8.

+  if (eob == 1) {

+    vp9_high_idct32x32_1_add(input, dest, stride, bd);

+  } else if (eob <= 34) {  // Any other eob up to 34: _34 variant.

+    vp9_high_idct32x32_34_add(input, dest, stride, bd);

+  } else {  // eob above 34: _1024 variant.

+    vp9_high_idct32x32_1024_add(input, dest, stride, bd);

+  }

+}

+// iht

+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,

+                         uint8_t *dest, int stride, int eob, int bd) {

+  if (tx_type == DCT_DCT)  // DCT_DCT goes through the eob-based dispatcher.

+    vp9_high_idct4x4_add(input, dest, stride, eob, bd);

+  else  // Other tx_type values: _16 hybrid transform; eob is not passed on.

+    vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);

+}

+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,

+                         uint8_t *dest, int stride, int eob, int bd) {

+  if (tx_type == DCT_DCT) {  // DCT_DCT goes through the eob-based dispatcher.

+    vp9_high_idct8x8_add(input, dest, stride, eob, bd);

+  } else {  // Other tx_type values: _64 hybrid transform; eob is not passed on.

+    vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);

+  }

+}

+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,

+                           uint8_t *dest, int stride, int eob, int bd) {

+  if (tx_type == DCT_DCT) {  // DCT_DCT goes through the eob-based dispatcher.

+    vp9_high_idct16x16_add(input, dest, stride, eob, bd);

+  } else {  // Other tx_type values: _256 hybrid transform; eob is not passed.

+    vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);

+  }

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -36,52 +36,69 @@

 #define dual_set_epi16(a, b) \

   _mm_set_epi16(b, b, b, b, a, a, a, a)

+// Note:

+// tran_low_t  is the datatype used for final transform coefficients.

+// tran_high_t is the datatype used for intermediate transform stages.

+#if CONFIG_VP9_HIGHBITDEPTH

+typedef int64_t tran_high_t;  // 64-bit intermediates in high-bitdepth builds.

+typedef int32_t tran_low_t;  // 32-bit coefficients in high-bitdepth builds.

+#else

+typedef int32_t tran_high_t;  // 32-bit intermediates otherwise.

+typedef int16_t tran_low_t;  // 16-bit coefficients otherwise.

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 // Constants:

 //  for (int i = 1; i< 32; ++i)

 //    printf("static const int cospi_%d_64 = %.0f;\n", i,

 //           round(16384 * cos(i*M_PI/64)));

 // Note: sin(k*Pi/64) = cos((32-k)*Pi/64)

-static const int cospi_1_64  = 16364;

-static const int cospi_2_64  = 16305;

-static const int cospi_3_64  = 16207;

-static const int cospi_4_64  = 16069;

-static const int cospi_5_64  = 15893;

-static const int cospi_6_64  = 15679;

-static const int cospi_7_64  = 15426;

-static const int cospi_8_64  = 15137;

-static const int cospi_9_64  = 14811;

-static const int cospi_10_64 = 14449;

-static const int cospi_11_64 = 14053;

-static const int cospi_12_64 = 13623;

-static const int cospi_13_64 = 13160;

-static const int cospi_14_64 = 12665;

-static const int cospi_15_64 = 12140;

-static const int cospi_16_64 = 11585;

-static const int cospi_17_64 = 11003;

-static const int cospi_18_64 = 10394;

-static const int cospi_19_64 = 9760;

-static const int cospi_20_64 = 9102;

-static const int cospi_21_64 = 8423;

-static const int cospi_22_64 = 7723;

-static const int cospi_23_64 = 7005;

-static const int cospi_24_64 = 6270;

-static const int cospi_25_64 = 5520;

-static const int cospi_26_64 = 4756;

-static const int cospi_27_64 = 3981;

-static const int cospi_28_64 = 3196;

-static const int cospi_29_64 = 2404;

-static const int cospi_30_64 = 1606;

-static const int cospi_31_64 = 804;

+static const tran_high_t cospi_1_64  = 16364;

+static const tran_high_t cospi_2_64  = 16305;

+static const tran_high_t cospi_3_64  = 16207;

+static const tran_high_t cospi_4_64  = 16069;

+static const tran_high_t cospi_5_64  = 15893;

+static const tran_high_t cospi_6_64  = 15679;

+static const tran_high_t cospi_7_64  = 15426;

+static const tran_high_t cospi_8_64  = 15137;

+static const tran_high_t cospi_9_64  = 14811;

+static const tran_high_t cospi_10_64 = 14449;

+static const tran_high_t cospi_11_64 = 14053;

+static const tran_high_t cospi_12_64 = 13623;

+static const tran_high_t cospi_13_64 = 13160;

+static const tran_high_t cospi_14_64 = 12665;

+static const tran_high_t cospi_15_64 = 12140;

+static const tran_high_t cospi_16_64 = 11585;

+static const tran_high_t cospi_17_64 = 11003;

+static const tran_high_t cospi_18_64 = 10394;

+static const tran_high_t cospi_19_64 = 9760;

+static const tran_high_t cospi_20_64 = 9102;

+static const tran_high_t cospi_21_64 = 8423;

+static const tran_high_t cospi_22_64 = 7723;

+static const tran_high_t cospi_23_64 = 7005;

+static const tran_high_t cospi_24_64 = 6270;

+static const tran_high_t cospi_25_64 = 5520;

+static const tran_high_t cospi_26_64 = 4756;

+static const tran_high_t cospi_27_64 = 3981;

+static const tran_high_t cospi_28_64 = 3196;

+static const tran_high_t cospi_29_64 = 2404;

+static const tran_high_t cospi_30_64 = 1606;

+static const tran_high_t cospi_31_64 = 804;

 //  16384 * sqrt(2) * sin(kPi/9) * 2 / 3

-static const int sinpi_1_9 = 5283;

-static const int sinpi_2_9 = 9929;

-static const int sinpi_3_9 = 13377;

-static const int sinpi_4_9 = 15212;

+static const tran_high_t sinpi_1_9 = 5283;

+static const tran_high_t sinpi_2_9 = 9929;

+static const tran_high_t sinpi_3_9 = 13377;

+static const tran_high_t sinpi_4_9 = 15212;

-static INLINE int dct_const_round_shift(int input) {

-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

-#if CONFIG_COEFFICIENT_RANGE_CHECKING

+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {

+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

+#if CONFIG_VP9_HIGHBITDEPTH

+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will

+  // stay within the ranges:

+  // - 8 bit: signed 16 bit integer

+  // - 10 bit: signed 18 bit integer

+  // - 12 bit: signed 20 bit integer

+#elif CONFIG_COEFFICIENT_RANGE_CHECKING

   // For valid VP9 input streams, intermediate stage coefficients should always

   // stay within the range of a signed 16 bit integer. Coefficients can go out

   // of this range for invalid/corrupt VP9 streams. However, strictly checking

@@ -91,32 +108,59 @@

   assert(INT16_MIN <= rv);

   assert(rv <= INT16_MAX);

 #endif

-  return (int16_t)rv;

+  return (tran_low_t)rv;

-typedef void (*transform_1d)(const int16_t*, int16_t*);

+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);

 typedef struct {

   transform_1d cols, rows;  // vertical and horizontal

 } transform_2d;

-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);

+#if CONFIG_VP9_HIGHBITDEPTH

+typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);

-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);

-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);

-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int

+typedef struct {

+  high_transform_1d cols, rows;  // vertical and horizontal

+} high_transform_2d;

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob);

+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob);

+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+                     int eob);

+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int

                        eob);

-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,

+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,

                        int eob);

-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                     int stride, int eob);

-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                     int stride, int eob);

-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,

+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,

                       int stride, int eob);

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd);

+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd);

+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+                          int eob, int bd);

+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,

+                            int eob, int bd);

+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,

+                            int eob, int bd);

+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,

+                         uint8_t *dest, int stride, int eob, int bd);

+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,

+                         uint8_t *dest, int stride, int eob, int bd);

+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,

+                           uint8_t *dest, int stride, int eob, int bd);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 #ifdef __cplusplus

 }  // extern "C"

 #endif

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -6,6 +6,7 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_enums.h"

+#include "vp9/common/vp9_idct.h"

 struct macroblockd;

@@ -329,68 +330,177 @@

 # dct

-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;

-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct4x4_1_add/;

-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;

-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;

+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct4x4_16_add/;

-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;

-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;

+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_1_add/;

-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";

-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;

+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_64_add/;

-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";

-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;

+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_12_add/;

-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;

-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;

+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_1_add/;

-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;

-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;

+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_256_add/;

-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;

-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;

+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_10_add/;

-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;

-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;

+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_1024_add/;

-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;

-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;

+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_34_add/;

-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;

-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;

+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_1_add/;

-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";

-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;

-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;

+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

+  specialize qw/vp9_iht4x4_16_add/;

-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";

-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;

-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;

+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

+  specialize qw/vp9_iht8x8_64_add/;

-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";

-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;

+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";

+  specialize qw/vp9_iht16x16_256_add/;

+  # dct and add

+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_iwht4x4_1_add/;

+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_iwht4x4_16_add/;

+} else {

+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;

+  $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;

+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;

+  $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;

+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;

+  $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;

+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";

+  $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;

+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";

+  $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;

+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;

+  $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;

+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;

+  $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;

+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;

+  $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;

+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;

+  $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;

+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;

+  $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;

+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;

+  $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;

+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

+  specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;

+  $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;

+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

+  specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;

+  $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;

+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";

+  specialize qw/vp9_iht16x16_256_add sse2 dspr2/;

+  # dct and add

+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_iwht4x4_1_add/;

+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

+  specialize qw/vp9_iwht4x4_16_add/;

+}

+# High bitdepth functions

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

+#

+# dct

+#

+add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct4x4_1_add/;

+add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct4x4_16_add/;

+add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct8x8_1_add/;

+add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct8x8_64_add/;

+add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct8x8_10_add/;

+add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct16x16_1_add/;

+add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct16x16_256_add/;

+add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct16x16_10_add/;

+add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct32x32_1024_add/;

+add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct32x32_34_add/;

+add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_idct32x32_1_add/;

+add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";

+specialize qw/vp9_high_iht4x4_16_add/;

+add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";

+specialize qw/vp9_high_iht8x8_64_add/;

+add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";

+specialize qw/vp9_high_iht16x16_256_add/;

 # dct and add

-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_iwht4x4_1_add/;

+add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_iwht4x4_1_add/;

-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";

-specialize qw/vp9_iwht4x4_16_add/;

+add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+specialize qw/vp9_high_iwht4x4_16_add/;

+}

 # Encoder functions below this point.

@@ -706,24 +816,43 @@

 specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";

 # ENCODEMB INVOKE

-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";

-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";

 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";

 specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";

-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

+# With CONFIG_VP9_HIGHBITDEPTH the transform coefficients are held in 32-bit

+# values, so the 16-bit assembler code for vp9_block_error can no longer be used.

+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";

+  specialize qw/vp9_block_error/;

-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_fp/;

-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_fp_32x32/;

-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b/;

+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b_32x32/;

+} else {

+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";

+  specialize qw/vp9_block_error avx2/, "$sse2_x86inc";

+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";

+}

 # Structured Similarity (SSIM)

@@ -736,45 +865,87 @@

 # fdct functions

-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";

-specialize qw/vp9_fht4x4 sse2/;

-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";

-specialize qw/vp9_fht8x8 sse2/;

+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht4x4/;

-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";

-specialize qw/vp9_fht16x16 sse2/;

+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht8x8/;

-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";

+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht16x16/;

-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct4x4_1 sse2/;

+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fwht4x4/;

-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct4x4 sse2/;

+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct4x4_1/;

-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct8x8_1 sse2 neon/;

+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct4x4/;

-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";

+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct8x8_1/;

-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct16x16_1 sse2/;

+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct8x8/;

-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct16x16 sse2/;

+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct16x16_1/;

-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct32x32_1 sse2/;

+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct16x16/;

-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct32x32 sse2 avx2/;

+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32_1/;

-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fdct32x32_rd sse2 avx2/;

+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32/;

+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32_rd/;

+} else {

+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht4x4 sse2/;

+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht8x8 sse2/;

+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_fht16x16 sse2/;

+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";

+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct4x4_1 sse2/;

+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct4x4 sse2/;

+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct8x8_1 sse2 neon/;

+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";

+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct16x16_1 sse2/;

+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct16x16 sse2/;

+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32_1 sse2/;

+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32 sse2 avx2/;

+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_fdct32x32_rd sse2 avx2/;

+}

 # Motion search

@@ -1369,7 +1540,79 @@

   add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";

   specialize qw/vp9_high_12_mse8x8/;

+  # ENCODEMB INVOKE

+  add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";

+  specialize qw/vp9_high_block_error/;

+  add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";

+  specialize qw/vp9_high_subtract_block/;

+  add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_high_quantize_fp/;

+  add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_high_quantize_fp_32x32/;

+  add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_high_quantize_b/;

+  add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+  specialize qw/vp9_high_quantize_b_32x32/;

+  #

+  # Structured Similarity (SSIM)

+  #

+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {

+    add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";

+    specialize qw/vp9_high_ssim_parms_8x8/;

+    add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";

+    specialize qw/vp9_high_ssim_parms_8x8_shift/;

+  }

+  # fdct functions

+  add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_high_fht4x4/;

+  add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_high_fht8x8/;

+  add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

+  specialize qw/vp9_high_fht16x16/;

+  add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fwht4x4/;

+  add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct4x4/;

+  add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct8x8_1/;

+  add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct8x8/;

+  add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct16x16_1/;

+  add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct16x16/;

+  add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct32x32_1/;

+  add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct32x32/;

+  add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";

+  specialize qw/vp9_high_fdct32x32_rd/;

+  add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";

+  specialize qw/vp9_high_temporal_filter_apply/;

+# End vp9_high encoder functions

 # end encoder functions

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -195,7 +195,7 @@

   struct macroblockd_plane *const pd = &xd->plane[plane];

   if (eob > 0) {

     TX_TYPE tx_type = DCT_DCT;

-    int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

     if (xd->lossless) {

       tx_type = DCT_DCT;

       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);

--- a/vp9/decoder/vp9_detokenize.c

+++ b/vp9/decoder/vp9_detokenize.c

@@ -51,7 +51,7 @@

   } while (0)

 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,

-                       int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,

+                       tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,

                        int ctx, const int16_t *scan, const int16_t *nb,

                        vp9_reader *r) {

   const int max_eob = 16 << (tx_size << 1);

--- a/vp9/encoder/vp9_block.h

+++ b/vp9/encoder/vp9_block.h

@@ -28,8 +28,8 @@

 struct macroblock_plane {

   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);

-  int16_t *qcoeff;

-  int16_t *coeff;

+  tran_low_t *qcoeff;

+  tran_low_t *coeff;

   uint16_t *eobs;

   struct buf_2d src;

@@ -119,8 +119,12 @@

   // Used to store sub partition's choices.

   MV pred_mv[MAX_REF_FRAMES];

-  void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);

-  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);

+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);

+  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);

+#if CONFIG_VP9_HIGHBITDEPTH

+  void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,

+                        int eob, int bd);

+#endif

};

 #ifdef __cplusplus

--- a/vp9/encoder/vp9_context_tree.c

+++ b/vp9/encoder/vp9_context_tree.c

@@ -30,13 +30,13 @@

   for (i = 0; i < MAX_MB_PLANE; ++i) {

     for (k = 0; k < 3; ++k) {

       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],

-                      vpx_memalign(16, num_pix * sizeof(int16_t)));

+                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));

       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],

-                      vpx_memalign(16, num_pix * sizeof(int16_t)));

+                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));

       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],

-                      vpx_memalign(16, num_pix * sizeof(int16_t)));

+                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));

       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],

-                      vpx_memalign(16, num_pix * sizeof(uint16_t)));

+                      vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k])));

       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];

       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];

       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];

--- a/vp9/encoder/vp9_context_tree.h

+++ b/vp9/encoder/vp9_context_tree.h

@@ -19,15 +19,15 @@

 typedef struct {

   MODE_INFO mic;

   uint8_t *zcoeff_blk;

-  int16_t *coeff[MAX_MB_PLANE][3];

-  int16_t *qcoeff[MAX_MB_PLANE][3];

-  int16_t *dqcoeff[MAX_MB_PLANE][3];

+  tran_low_t *coeff[MAX_MB_PLANE][3];

+  tran_low_t *qcoeff[MAX_MB_PLANE][3];

+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];

   uint16_t *eobs[MAX_MB_PLANE][3];

   // dual buffer pointers, 0: in use, 1: best in store

-  int16_t *coeff_pbuf[MAX_MB_PLANE][3];

-  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];

-  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];

+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];

+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];

+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];

   uint16_t *eobs_pbuf[MAX_MB_PLANE][3];

   int is_coded;

--- a/vp9/encoder/vp9_dct.c

+++ b/vp9/encoder/vp9_dct.c

@@ -18,15 +18,17 @@

 #include "vp9/common/vp9_idct.h"

 #include "vp9/common/vp9_systemdependent.h"

-static INLINE int fdct_round_shift(int input) {

-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

-  assert(INT16_MIN <= rv && rv <= INT16_MAX);

+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {

+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

+  // TODO(debargha, peter.derivaz): Find new bounds for this assert

+  // and make the bounds consts.

+  // assert(INT16_MIN <= rv && rv <= INT16_MAX);

   return rv;

-static void fdct4(const int16_t *input, int16_t *output) {

-  int16_t step[4];

-  int temp1, temp2;

+static void fdct4(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t step[4];

+  tran_high_t temp1, temp2;

   step[0] = input[0] + input[3];

   step[1] = input[1] + input[2];

@@ -43,9 +45,9 @@

   output[3] = fdct_round_shift(temp2);

-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {

   int r, c;

-  int16_t sum = 0;

+  tran_low_t sum = 0;

   for (r = 0; r < 4; ++r)

     for (c = 0; c < 4; ++c)

       sum += input[r * stride + c];

@@ -54,7 +56,7 @@

   output[1] = 0;

-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {

   // The 2D transform is done with two passes which are actually pretty

   // similar. In the first one, we transform the columns and transpose

   // the results. In the second one, we transform the rows. To achieve that,

@@ -63,22 +65,23 @@

   // in normal/row positions).

   int pass;

   // We need an intermediate buffer between passes.

-  int16_t intermediate[4 * 4];

-  const int16_t *in = input;

-  int16_t *out = intermediate;

+  tran_low_t intermediate[4 * 4];

+  const int16_t *in_pass0 = input;

+  const tran_low_t *in = NULL;

+  tran_low_t *out = intermediate;

   // Do the two transform/transpose passes

   for (pass = 0; pass < 2; ++pass) {

-    /*canbe16*/ int input[4];

-    /*canbe16*/ int step[4];

-    /*needs32*/ int temp1, temp2;

+    tran_high_t input[4];      // canbe16

+    tran_high_t step[4];       // canbe16

+    tran_high_t temp1, temp2;  // needs32

     int i;

     for (i = 0; i < 4; ++i) {

       // Load inputs.

       if (0 == pass) {

-        input[0] = in[0 * stride] * 16;

-        input[1] = in[1 * stride] * 16;

-        input[2] = in[2 * stride] * 16;

-        input[3] = in[3 * stride] * 16;

+        input[0] = in_pass0[0 * stride] * 16;

+        input[1] = in_pass0[1 * stride] * 16;

+        input[2] = in_pass0[2 * stride] * 16;

+        input[3] = in_pass0[3 * stride] * 16;

         if (i == 0 && input[0]) {

           input[0] += 1;

@@ -102,6 +105,7 @@

       out[1] = fdct_round_shift(temp1);

       out[3] = fdct_round_shift(temp2);

       // Do next column (which is a transposed row in second/horizontal pass)

+      in_pass0++;

       in++;

       out += 4;

@@ -119,9 +123,9 @@

-static void fadst4(const int16_t *input, int16_t *output) {

-  int x0, x1, x2, x3;

-  int s0, s1, s2, s3, s4, s5, s6, s7;

+static void fadst4(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t x0, x1, x2, x3;

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

   x0 = input[0];

   x1 = input[1];

@@ -166,15 +170,15 @@

   { fadst4, fadst4 }   // ADST_ADST = 3

};

-void vp9_fht4x4_c(const int16_t *input, int16_t *output,

+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,

                   int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

     vp9_fdct4x4_c(input, output, stride);

   } else {

-    int16_t out[4 * 4];

-    int16_t *outptr = &out[0];

+    tran_low_t out[4 * 4];

+    tran_low_t *outptr = &out[0];

     int i, j;

-    int16_t temp_in[4], temp_out[4];

+    tran_low_t temp_in[4], temp_out[4];

     const transform_2d ht = FHT_4[tx_type];

     // Columns

@@ -199,10 +203,10 @@

-static void fdct8(const int16_t *input, int16_t *output) {

-  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

-  /*needs32*/ int t0, t1, t2, t3;

-  /*canbe16*/ int x0, x1, x2, x3;

+static void fdct8(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16

+  tran_high_t t0, t1, t2, t3;                  // needs32

+  tran_high_t x0, x1, x2, x3;                  // canbe16

   // stage 1

   s0 = input[0] + input[7];

@@ -251,9 +255,9 @@

   output[7] = fdct_round_shift(t3);

-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {

   int r, c;

-  int16_t sum = 0;

+  tran_low_t sum = 0;

   for (r = 0; r < 8; ++r)

     for (c = 0; c < 8; ++c)

       sum += input[r * stride + c];

@@ -262,16 +266,16 @@

   output[1] = 0;

-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {

+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {

   int i, j;

-  int16_t intermediate[64];

+  tran_low_t intermediate[64];

   // Transform columns

-    int16_t *output = intermediate;

-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

-    /*needs32*/ int t0, t1, t2, t3;

-    /*canbe16*/ int x0, x1, x2, x3;

+    tran_low_t *output = intermediate;

+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16

+    tran_high_t t0, t1, t2, t3;                  // needs32

+    tran_high_t x0, x1, x2, x3;                  // canbe16

     int i;

     for (i = 0; i < 8; i++) {

@@ -333,9 +337,9 @@

-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {

   int r, c;

-  int16_t sum = 0;

+  tran_low_t sum = 0;

   for (r = 0; r < 16; ++r)

     for (c = 0; c < 16; ++c)

       sum += input[r * stride + c];

@@ -344,7 +348,7 @@

   output[1] = 0;

-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {

   // The 2D transform is done with two passes which are actually pretty

   // similar. In the first one, we transform the columns and transpose

   // the results. In the second one, we transform the rows. To achieve that,

@@ -353,37 +357,38 @@

   // in normal/row positions).

   int pass;

   // We need an intermediate buffer between passes.

-  int16_t intermediate[256];

-  const int16_t *in = input;

-  int16_t *out = intermediate;

+  tran_low_t intermediate[256];

+  const int16_t *in_pass0 = input;

+  const tran_low_t *in = NULL;

+  tran_low_t *out = intermediate;

   // Do the two transform/transpose passes

   for (pass = 0; pass < 2; ++pass) {

-    /*canbe16*/ int step1[8];

-    /*canbe16*/ int step2[8];

-    /*canbe16*/ int step3[8];

-    /*canbe16*/ int input[8];

-    /*needs32*/ int temp1, temp2;

+    tran_high_t step1[8];      // canbe16

+    tran_high_t step2[8];      // canbe16

+    tran_high_t step3[8];      // canbe16

+    tran_high_t input[8];      // canbe16

+    tran_high_t temp1, temp2;  // needs32

     int i;

     for (i = 0; i < 16; i++) {

       if (0 == pass) {

         // Calculate input for the first 8 results.

-        input[0] = (in[0 * stride] + in[15 * stride]) * 4;

-        input[1] = (in[1 * stride] + in[14 * stride]) * 4;

-        input[2] = (in[2 * stride] + in[13 * stride]) * 4;

-        input[3] = (in[3 * stride] + in[12 * stride]) * 4;

-        input[4] = (in[4 * stride] + in[11 * stride]) * 4;

-        input[5] = (in[5 * stride] + in[10 * stride]) * 4;

-        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;

-        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;

+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;

+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;

+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;

+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;

+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;

+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;

+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;

+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;

         // Calculate input for the next 8 results.

-        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;

-        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;

-        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;

-        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;

-        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;

-        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;

-        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;

-        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;

+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;

+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;

+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;

+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;

+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;

+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;

+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;

+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;

       } else {

         // Calculate input for the first 8 results.

         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);

@@ -406,9 +411,9 @@

       // Work on the first eight values; fdct8(input, even_results);

-        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

-        /*needs32*/ int t0, t1, t2, t3;

-        /*canbe16*/ int x0, x1, x2, x3;

+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16

+        tran_high_t t0, t1, t2, t3;                  // needs32

+        tran_high_t x0, x1, x2, x3;                  // canbe16

         // stage 1

         s0 = input[0] + input[7];

@@ -514,6 +519,7 @@

       // Do next column (which is a transposed row in second/horizontal pass)

       in++;

+      in_pass0++;

       out += 16;

     // Setup in/out for next pass.

@@ -522,17 +528,17 @@

-static void fadst8(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7;

+static void fadst8(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

-  int x0 = input[7];

-  int x1 = input[0];

-  int x2 = input[5];

-  int x3 = input[2];

-  int x4 = input[3];

-  int x5 = input[4];

-  int x6 = input[1];

-  int x7 = input[6];

+  tran_high_t x0 = input[7];

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[5];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[3];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[1];

+  tran_high_t x7 = input[6];

   // stage 1

   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

@@ -600,15 +606,15 @@

   { fadst8, fadst8 }   // ADST_ADST = 3

};

-void vp9_fht8x8_c(const int16_t *input, int16_t *output,

+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,

                   int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

     vp9_fdct8x8_c(input, output, stride);

   } else {

-    int16_t out[64];

-    int16_t *outptr = &out[0];

+    tran_low_t out[64];

+    tran_low_t *outptr = &out[0];

     int i, j;

-    int16_t temp_in[8], temp_out[8];

+    tran_low_t temp_in[8], temp_out[8];

     const transform_2d ht = FHT_8[tx_type];

     // Columns

@@ -633,17 +639,18 @@

 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per

    pixel. */

-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {

   int i;

-  int a1, b1, c1, d1, e1;

-  const int16_t *ip = input;

-  int16_t *op = output;

+  tran_high_t a1, b1, c1, d1, e1;

+  const int16_t *ip_pass0 = input;

+  const tran_low_t *ip = NULL;

+  tran_low_t *op = output;

   for (i = 0; i < 4; i++) {

-    a1 = ip[0 * stride];

-    b1 = ip[1 * stride];

-    c1 = ip[2 * stride];

-    d1 = ip[3 * stride];

+    a1 = ip_pass0[0 * stride];

+    b1 = ip_pass0[1 * stride];

+    c1 = ip_pass0[2 * stride];

+    d1 = ip_pass0[3 * stride];

     a1 += b1;

     d1 = d1 - c1;

@@ -657,7 +664,7 @@

     op[8] = d1;

     op[12] = b1;

-    ip++;

+    ip_pass0++;

     op++;

   ip = output;

@@ -687,12 +694,12 @@

 // Rewrote to use same algorithm as others.

-static void fdct16(const int16_t in[16], int16_t out[16]) {

-  /*canbe16*/ int step1[8];

-  /*canbe16*/ int step2[8];

-  /*canbe16*/ int step3[8];

-  /*canbe16*/ int input[8];

-  /*needs32*/ int temp1, temp2;

+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {

+  tran_high_t step1[8];      // canbe16

+  tran_high_t step2[8];      // canbe16

+  tran_high_t step3[8];      // canbe16

+  tran_high_t input[8];      // canbe16

+  tran_high_t temp1, temp2;  // needs32

   // step 1

   input[0] = in[0] + in[15];

@@ -715,9 +722,9 @@

   // fdct8(step, step);

-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

-    /*needs32*/ int t0, t1, t2, t3;

-    /*canbe16*/ int x0, x1, x2, x3;

+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16

+    tran_high_t t0, t1, t2, t3;                  // needs32

+    tran_high_t x0, x1, x2, x3;                  // canbe16

     // stage 1

     s0 = input[0] + input[7];

@@ -828,25 +835,26 @@

   out[15] = fdct_round_shift(temp2);

-static void fadst16(const int16_t *input, int16_t *output) {

-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

+static void fadst16(const tran_low_t *input, tran_low_t *output) {

+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

+  tran_high_t s9, s10, s11, s12, s13, s14, s15;

-  int x0 = input[15];

-  int x1 = input[0];

-  int x2 = input[13];

-  int x3 = input[2];

-  int x4 = input[11];

-  int x5 = input[4];

-  int x6 = input[9];

-  int x7 = input[6];

-  int x8 = input[7];

-  int x9 = input[8];

-  int x10 = input[5];

-  int x11 = input[10];

-  int x12 = input[3];

-  int x13 = input[12];

-  int x14 = input[1];

-  int x15 = input[14];

+  tran_high_t x0 = input[15];

+  tran_high_t x1 = input[0];

+  tran_high_t x2 = input[13];

+  tran_high_t x3 = input[2];

+  tran_high_t x4 = input[11];

+  tran_high_t x5 = input[4];

+  tran_high_t x6 = input[9];

+  tran_high_t x7 = input[6];

+  tran_high_t x8 = input[7];

+  tran_high_t x9 = input[8];

+  tran_high_t x10 = input[5];

+  tran_high_t x11 = input[10];

+  tran_high_t x12 = input[3];

+  tran_high_t x13 = input[12];

+  tran_high_t x14 = input[1];

+  tran_high_t x15 = input[14];

   // stage 1

   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

@@ -997,15 +1005,15 @@

   { fadst16, fadst16 }   // ADST_ADST = 3

};

-void vp9_fht16x16_c(const int16_t *input, int16_t *output,

+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,

                     int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

     vp9_fdct16x16_c(input, output, stride);

   } else {

-    int16_t out[256];

-    int16_t *outptr = &out[0];

+    tran_low_t out[256];

+    tran_low_t *outptr = &out[0];

     int i, j;

-    int16_t temp_in[16], temp_out[16];

+    tran_low_t temp_in[16], temp_out[16];

     const transform_2d ht = FHT_16[tx_type];

     // Columns

@@ -1028,19 +1036,21 @@

-static INLINE int dct_32_round(int input) {

-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

-  assert(-131072 <= rv && rv <= 131071);

+static INLINE tran_high_t dct_32_round(tran_high_t input) {

+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

+  // TODO(debargha, peter.derivaz): Find new bounds for this assert,

+  // and make the bounds consts.

+  // assert(-131072 <= rv && rv <= 131071);

   return rv;

-static INLINE int half_round_shift(int input) {

-  int rv = (input + 1 + (input < 0)) >> 2;

+static INLINE tran_high_t half_round_shift(tran_high_t input) {

+  tran_high_t rv = (input + 1 + (input < 0)) >> 2;

   return rv;

-static void fdct32(const int *input, int *output, int round) {

-  int step[32];

+static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {

+  tran_high_t step[32];

   // Stage 1

   step[0] = input[0] + input[(32 - 1)];

   step[1] = input[1] + input[(32 - 2)];

@@ -1362,9 +1372,9 @@

   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);

-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {

+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {

   int r, c;

-  int16_t sum = 0;

+  tran_low_t sum = 0;

   for (r = 0; r < 32; ++r)

     for (c = 0; c < 32; ++c)

       sum += input[r * stride + c];

@@ -1373,13 +1383,13 @@

   output[1] = 0;

-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {

+void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

   int i, j;

-  int output[32 * 32];

+  tran_high_t output[32 * 32];

   // Columns

   for (i = 0; i < 32; ++i) {

-    int temp_in[32], temp_out[32];

+    tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = input[j * stride + i] * 4;

     fdct32(temp_in, temp_out, 0);

@@ -1389,7 +1399,7 @@

   // Rows

   for (i = 0; i < 32; ++i) {

-    int temp_in[32], temp_out[32];

+    tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = output[j + i * 32];

     fdct32(temp_in, temp_out, 0);

@@ -1401,13 +1411,13 @@

 // Note that although we use dct_32_round in dct32 computation flow,

 // this 2d fdct32x32 for rate-distortion optimization loop is operating

 // within 16 bits precision.

-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {

+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {

   int i, j;

-  int output[32 * 32];

+  tran_high_t output[32 * 32];

   // Columns

   for (i = 0; i < 32; ++i) {

-    int temp_in[32], temp_out[32];

+    tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = input[j * stride + i] * 4;

     fdct32(temp_in, temp_out, 0);

@@ -1420,7 +1430,7 @@

   // Rows

   for (i = 0; i < 32; ++i) {

-    int temp_in[32], temp_out[32];

+    tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = output[j + i * 32];

     fdct32(temp_in, temp_out, 1);

@@ -1428,3 +1438,61 @@

       out[j + i * 32] = temp_out[j];

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {

+  vp9_fdct4x4_c(input, output, stride);  // thin wrapper: all three arguments are forwarded unchanged

+}

+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,

+                       int stride, int tx_type) {

+  vp9_fht4x4_c(input, output, stride, tx_type);  // thin wrapper: tx_type and the buffers are forwarded unchanged

+}

+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,

+                          int stride) {

+  vp9_fdct8x8_1_c(input, final_output, stride);  // thin wrapper: no work is done here beyond this call

+}

+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,

+                        int stride) {

+  vp9_fdct8x8_c(input, final_output, stride);  // thin wrapper: arguments are forwarded unchanged

+}

+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,

+                            int stride) {

+  vp9_fdct16x16_1_c(input, output, stride);  // thin wrapper: no work is done here beyond this call

+}

+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,

+                          int stride) {

+  vp9_fdct16x16_c(input, output, stride);  // thin wrapper: arguments are forwarded unchanged

+}

+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,

+                  int stride, int tx_type) {

+  vp9_fht8x8_c(input, output, stride, tx_type);  // thin wrapper: vp9_fht8x8_c does the tx_type dispatch (DCT_DCT vs. FHT_8 table)

+}

+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {

+  vp9_fwht4x4_c(input, output, stride);  // thin wrapper around the 4x4 Walsh-Hadamard routine; arguments are forwarded unchanged

+}

+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,

+                    int stride, int tx_type) {

+  vp9_fht16x16_c(input, output, stride, tx_type);  // thin wrapper: vp9_fht16x16_c does the tx_type dispatch (DCT_DCT vs. FHT_16 table)

+}

+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {

+  vp9_fdct32x32_1_c(input, out, stride);  // thin wrapper: no work is done here beyond this call

+}

+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

+  vp9_fdct32x32_c(input, out, stride);  // thin wrapper: arguments are forwarded unchanged

+}

+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,

+                             int stride) {

+  vp9_fdct32x32_rd_c(input, out, stride);  // thin wrapper around the rate-distortion-loop variant; arguments are forwarded unchanged

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -107,9 +107,9 @@

   vp9_token_state tokens[1025][2];

   unsigned best_index[1025][2];

   uint8_t token_cache[1024];

-  const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);

-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);

+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   const int eob = p->eobs[block];

   const PLANE_TYPE type = pd->plane_type;

   const int default_eob = 16 << (tx_size << 1);

@@ -294,7 +294,8 @@

 static INLINE void fdct32x32(int rd_transform,

-                             const int16_t *src, int16_t *dst, int src_stride) {

+                             const int16_t *src, tran_low_t *dst,

+                             int src_stride) {

   if (rd_transform)

     vp9_fdct32x32_rd(src, dst, src_stride);

   else

@@ -301,6 +302,16 @@

     vp9_fdct32x32(src, dst, src_stride);

+#if CONFIG_VP9_HIGHBITDEPTH

+static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,

+                                  tran_low_t *dst, int src_stride) {

+  if (rd_transform)  // nonzero rd_transform picks the _rd variant

+    vp9_high_fdct32x32_rd(src, dst, src_stride);

+  else  // otherwise the regular vp9_high_fdct32x32 is used

+    vp9_high_fdct32x32(src, dst, src_stride);

+}  // same dispatch shape as fdct32x32(), but calling the vp9_high_* entry points

+#endif

 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,

                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -307,9 +318,9 @@

   const struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &xd->plane[plane];

   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];

-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   uint16_t *const eob = &p->eobs[block];

   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

   int i, j;

@@ -357,9 +368,9 @@

   MACROBLOCKD *const xd = &x->e_mbd;

   const struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &xd->plane[plane];

-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   uint16_t *const eob = &p->eobs[block];

   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

   int i, j;

@@ -405,9 +416,9 @@

   const struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &xd->plane[plane];

   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];

-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   uint16_t *const eob = &p->eobs[block];

   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

   int i, j;

@@ -458,7 +469,7 @@

   struct optimize_ctx *const ctx = args->ctx;

   struct macroblock_plane *const p = &x->plane[plane];

   struct macroblockd_plane *const pd = &xd->plane[plane];

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   int i, j;

   uint8_t *dst;

   ENTROPY_CONTEXT *a, *l;

@@ -538,7 +549,7 @@

   MACROBLOCKD *const xd = &x->e_mbd;

   struct macroblock_plane *const p = &x->plane[plane];

   struct macroblockd_plane *const pd = &xd->plane[plane];

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   int i, j;

   uint8_t *dst;

   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

@@ -587,9 +598,9 @@

   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;

   struct macroblock_plane *const p = &x->plane[plane];

   struct macroblockd_plane *const pd = &xd->plane[plane];

-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);

-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);

+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   const scan_order *scan_order;

   TX_TYPE tx_type;

   PREDICTION_MODE mode;

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -556,6 +556,9 @@

   cm->profile = oxcf->profile;

   cm->bit_depth = oxcf->bit_depth;

+#if CONFIG_VP9_HIGHBITDEPTH

+  cm->use_highbitdepth = oxcf->use_highbitdepth;

+#endif

   cm->color_space = UNKNOWN;

   cm->width = oxcf->width;

@@ -613,6 +616,11 @@

     assert(cm->bit_depth > VPX_BITS_8);

   cpi->oxcf = *oxcf;

+#if CONFIG_VP9_HIGHBITDEPTH

+  if (cpi->oxcf.use_highbitdepth) {

+    cpi->mb.e_mbd.bd = (int)cm->bit_depth;

+  }

+#endif

   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;

@@ -2768,7 +2776,16 @@

   if (oxcf->pass == 1 &&

       (!cpi->use_svc || is_two_pass_svc(cpi))) {

     const int lossless = is_lossless_requested(oxcf);

+#if CONFIG_VP9_HIGHBITDEPTH

+    if (cpi->oxcf.use_highbitdepth)

+      cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;

+    else

+      cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;

+    cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :

+                                       vp9_high_idct4x4_add;

+#else

     cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;

+#endif

     cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;

     vp9_first_pass(cpi, source);

   } else if (oxcf->pass == 2 &&

--- a/vp9/encoder/vp9_encoder.h

+++ b/vp9/encoder/vp9_encoder.h

@@ -217,6 +217,9 @@

   vp8e_tuning tuning;

   vp9e_tune_content content;

+#if CONFIG_VP9_HIGHBITDEPTH

+  int use_highbitdepth;

+#endif

 } VP9EncoderConfig;

 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -19,9 +19,9 @@

 #include "vp9/encoder/vp9_quantize.h"

 #include "vp9/encoder/vp9_rd.h"

-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,

+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,

                      const int16_t *round_ptr, const int16_t quant,

-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                      const int16_t dequant_ptr, uint16_t *eob_ptr) {

   const int rc = 0;

   const int coeff = coeff_ptr[rc];

@@ -40,9 +40,9 @@

   *eob_ptr = eob + 1;

-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,

+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

                            const int16_t *round_ptr, const int16_t quant,

-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                            const int16_t dequant_ptr, uint16_t *eob_ptr) {

   const int rc = 0;

   const int coeff = coeff_ptr[rc];

@@ -62,11 +62,11 @@

   *eob_ptr = eob + 1;

-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,

+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                        int skip_block,

                        const int16_t *zbin_ptr, const int16_t *round_ptr,

                        const int16_t *quant_ptr, const int16_t *quant_shift_ptr,

-                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                        const int16_t *dequant_ptr,

                        int zbin_oq_value, uint16_t *eob_ptr,

                        const int16_t *scan, const int16_t *iscan) {

@@ -78,13 +78,13 @@

   (void)zbin_oq_value;

   (void)iscan;

-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));

-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));

+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   if (!skip_block) {

     // Quantization pass: All coefficients with index >= zero_flag are

     // skippable. Note: zero_flag can be zero.

-    for (i = 0; i < count; i++) {

+    for (i = 0; i < n_coeffs; i++) {

       const int rc = scan[i];

       const int coeff = coeff_ptr[rc];

       const int coeff_sign = (coeff >> 31);

@@ -105,12 +105,12 @@

 // TODO(jingning) Refactor this file and combine functions with similar

 // operations.

-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,

+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                              int skip_block,

                              const int16_t *zbin_ptr, const int16_t *round_ptr,

                              const int16_t *quant_ptr,

                              const int16_t *quant_shift_ptr,

-                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                              const int16_t *dequant_ptr,

                              int zbin_oq_value, uint16_t *eob_ptr,

                              const int16_t *scan, const int16_t *iscan) {

@@ -120,8 +120,8 @@

   (void)zbin_oq_value;

   (void)iscan;

-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));

-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));

+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   if (!skip_block) {

     for (i = 0; i < n_coeffs; i++) {

@@ -146,15 +146,15 @@

   *eob_ptr = eob + 1;

-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,

+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                       int skip_block,

                       const int16_t *zbin_ptr, const int16_t *round_ptr,

                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,

-                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                       const int16_t *dequant_ptr,

                       int zbin_oq_value, uint16_t *eob_ptr,

                       const int16_t *scan, const int16_t *iscan) {

-  int i, non_zero_count = (int)count, eob = -1;

+  int i, non_zero_count = (int)n_coeffs, eob = -1;

   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,

                          zbin_ptr[1] + zbin_oq_value };

   const int nzbins[2] = { zbins[0] * -1,

@@ -161,12 +161,12 @@

                           zbins[1] * -1 };

   (void)iscan;

-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));

-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));

+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   if (!skip_block) {

     // Pre-scan pass

-    for (i = (int)count - 1; i >= 0; i--) {

+    for (i = (int)n_coeffs - 1; i >= 0; i--) {

       const int rc = scan[i];

       const int coeff = coeff_ptr[rc];

@@ -199,12 +199,12 @@

   *eob_ptr = eob + 1;

-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,

+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                             int skip_block,

                             const int16_t *zbin_ptr, const int16_t *round_ptr,

                             const int16_t *quant_ptr,

                             const int16_t *quant_shift_ptr,

-                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                             const int16_t *dequant_ptr,

                             int zbin_oq_value, uint16_t *eob_ptr,

                             const int16_t *scan, const int16_t *iscan) {

@@ -217,8 +217,8 @@

   int i, eob = -1;

   (void)iscan;

-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));

-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));

+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

   if (!skip_block) {

     // Pre-scan pass

@@ -280,6 +280,12 @@

   *shift = 1 << (16 - l);

+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {

+  int quant = vp9_dc_quant(q, 0);  // value compared against the 148 threshold below

+  (void) bit_depth;  // parameter is accepted but intentionally unused in this version

+  return q == 0 ? 64 : (quant < 148 ? 84 : 80);  // 64 when q == 0; otherwise 84 if quant < 148, else 80

+}

 void vp9_init_quantizer(VP9_COMP *cpi) {

   VP9_COMMON *const cm = &cpi->common;

   QUANTS *const quants = &cpi->quants;

@@ -286,7 +292,7 @@

   int i, q, quant;

   for (q = 0; q < QINDEX_RANGE; q++) {

-    const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);

+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);

     const int qrounding_factor = q == 0 ? 64 : 48;

     for (i = 0; i < 2; ++i) {

--- a/vp9/encoder/vp9_quantize.h

+++ b/vp9/encoder/vp9_quantize.h

@@ -37,16 +37,28 @@

   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);

 } QUANTS;

-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,

+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,

                      const int16_t *round_ptr, const int16_t quant_ptr,

-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                      const int16_t dequant_ptr, uint16_t *eob_ptr);

-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,

+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

                            const int16_t *round_ptr, const int16_t quant_ptr,

-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                            const int16_t dequant_ptr, uint16_t *eob_ptr);

 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,

                                 const int16_t *scan, const int16_t *iscan);

+#if CONFIG_VP9_HIGHBITDEPTH

+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,

+                          const int16_t *round_ptr, const int16_t quant_ptr,

+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                          const int16_t dequant_ptr, uint16_t *eob_ptr);

+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

+                                const int16_t *round_ptr,

+                                const int16_t quant_ptr, tran_low_t *qcoeff_ptr,

+                                tran_low_t *dqcoeff_ptr,

+                                const int16_t dequant_ptr, uint16_t *eob_ptr);

+#endif

 struct VP9_COMP;

 struct VP9Common;

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -249,7 +249,7 @@

   *out_dist_sum = dist_sum << 4;

-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,

+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,

                           intptr_t block_size, int64_t *ssz) {

   int i;

   int64_t error = 0, sqcoeff = 0;

@@ -288,7 +288,7 @@

   const PLANE_TYPE type = pd->plane_type;

   const int16_t *band_count = &band_counts[tx_size][1];

   const int eob = p->eobs[block];

-  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =

                    x->token_costs[tx_size][type][is_inter_block(mbmi)];

   uint8_t token_cache[32 * 32];

@@ -358,8 +358,8 @@

   const struct macroblockd_plane *const pd = &xd->plane[plane];

   int64_t this_sse;

   int shift = tx_size == TX_32X32 ? 0 : 2;

-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,

                                &this_sse) >> shift;

   args->sse  = this_sse >> shift;

@@ -405,8 +405,8 @@

       dist_block(plane, block, tx_size, args);

     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {

       // compute DC coefficient

-      int16_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);

-      int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);

+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);

+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);

       vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);

       args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;

       args->dist = args->sse;

@@ -690,7 +690,7 @@

         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];

         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,

                                                             p->src_diff);

-        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);

+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);

         xd->mi[0]->bmi[block].as_mode = mode;

         vp9_predict_intra_block(xd, block, 1,

                                 TX_4X4, mode,

@@ -1137,7 +1137,7 @@

   for (idy = 0; idy < height / 4; ++idy) {

     for (idx = 0; idx < width / 4; ++idx) {

       int64_t ssz, rd, rd1, rd2;

-      int16_t* coeff;

+      tran_low_t* coeff;

       k += (idy * 2 + idx);

       coeff = BLOCK_OFFSET(p->coeff, k);

--- a/vp9/encoder/vp9_tokenize.c

+++ b/vp9/encoder/vp9_tokenize.c

@@ -212,7 +212,7 @@

   TOKENEXTRA *t = *tp;        /* store tokens starting here */

   int eob = p->eobs[block];

   const PLANE_TYPE type = pd->plane_type;

-  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);

   const int segment_id = mbmi->segment_id;

   const int16_t *scan, *nb;

   const scan_order *so;

--- a/vp9/vp9_cx_iface.c

+++ b/vp9/vp9_cx_iface.c

@@ -686,6 +686,10 @@

     if (res == VPX_CODEC_OK) {

       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);

+#if CONFIG_VP9_HIGHBITDEPTH

+      priv->oxcf.use_highbitdepth =

+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;

+#endif

       priv->cpi = vp9_create_compressor(&priv->oxcf);

       if (priv->cpi == NULL)

         res = VPX_CODEC_MEM_ERROR;

@@ -1333,6 +1337,9 @@

 CODEC_INTERFACE(vpx_codec_vp9_cx) = {

   "WebM Project VP9 Encoder" VERSION_STRING,

   VPX_CODEC_INTERNAL_ABI_VERSION,

+#if CONFIG_VP9_HIGHBITDEPTH

+  VPX_CODEC_CAP_HIGHBITDEPTH |

+#endif

   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t

   encoder_init,       // vpx_codec_init_fn_t

   encoder_destroy,    // vpx_codec_destroy_fn_t