shithub: libvpx

--- a/test/dct16x16_test.cc

+++ b/test/dct16x16_test.cc

@@ -255,11 +255,11 @@

 #if CONFIG_VP9_HIGHBITDEPTH

 void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);

+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);

+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,

@@ -273,36 +273,36 @@

 void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);

+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);

 void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);

+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);

 #if HAVE_SSE2

 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);

+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);

+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);

+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);

+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);

 void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);

+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);

+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);

 #endif  // HAVE_SSE2

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -353,7 +353,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

@@ -475,10 +475,10 @@

         ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,

+        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,

                      tx_type_);

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

       if (bit_depth_ == VPX_BITS_8) {

@@ -530,8 +530,7 @@

         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

-        ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));

+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -585,9 +584,9 @@

         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));

       } else {

 #if CONFIG_VP9_HIGHBITDEPTH

-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);

+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/test/dct32x32_test.cc

+++ b/test/dct32x32_test.cc

@@ -71,11 +71,11 @@

 #if CONFIG_VP9_HIGHBITDEPTH

 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);

+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);

+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -137,7 +137,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

     } else {

       ASM_REGISTER_STATE_CHECK(

-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));

+          inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32));

 #endif

@@ -275,7 +275,7 @@

       ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));

 #if CONFIG_VP9_HIGHBITDEPTH

     } else {

-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));

+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32));

 #endif

     for (int j = 0; j < kNumCoeffs; ++j) {

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -55,36 +55,36 @@

 #if CONFIG_VP9_HIGHBITDEPTH

 void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);

+  vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);

+  vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);

+  vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);

 void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);

+  vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);

 void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);

+  vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);

+  vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 #if HAVE_SSE2

 void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);

+  vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);

+  vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);

 #endif  // HAVE_SSE2

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -135,7 +135,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

@@ -249,7 +249,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

--- a/test/fdct8x8_test.cc

+++ b/test/fdct8x8_test.cc

@@ -88,45 +88,45 @@

 #if CONFIG_VP9_HIGHBITDEPTH

 void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);

+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);

+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);

+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);

 void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);

+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);

 #if HAVE_SSE2

 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);

+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);

+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);

 void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);

+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);

+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);

 void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);

+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);

 void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);

+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);

 #endif  // HAVE_SSE2

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -257,7 +257,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

@@ -340,7 +340,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

@@ -413,7 +413,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

@@ -497,9 +497,9 @@

         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));

 #if CONFIG_VP9_HIGHBITDEPTH

       } else {

-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);

+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);

         ASM_REGISTER_STATE_CHECK(

-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));

+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));

 #endif

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -43,9 +43,11 @@

 #if CONFIG_VP9_HIGHBITDEPTH

-template <InvTxfmWithBdFunc fn>

+typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out,

+                                  int stride, int bd);

+template <InvTxfmHighbdFunc fn>

 void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {

-  fn(in, CONVERT_TO_BYTEPTR(out), stride, bd);

+  fn(in, CAST_TO_SHORTPTR(out), stride, bd);

 #endif

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -205,7 +205,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,

                                 int stride, int tx_type, int bd) {

   const highbd_transform_2d IHT_4[] = {

     { vpx_highbd_idct4_c, vpx_highbd_idct4_c },   // DCT_DCT  = 0

@@ -213,7 +213,6 @@

     { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2

     { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3

};

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int i, j;

   tran_low_t out[4 * 4];

@@ -245,7 +244,7 @@

   { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }  // ADST_ADST = 3

};

-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,

                                 int stride, int tx_type, int bd) {

   int i, j;

   tran_low_t out[8 * 8];

@@ -252,7 +251,6 @@

   tran_low_t *outptr = out;

   tran_low_t temp_in[8], temp_out[8];

   const highbd_transform_2d ht = HIGH_IHT_8[tx_type];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // Inverse transform row vectors.

   for (i = 0; i < 8; ++i) {

@@ -279,7 +277,7 @@

   { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }  // ADST_ADST = 3

};

-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,

                                    int stride, int tx_type, int bd) {

   int i, j;

   tran_low_t out[16 * 16];

@@ -286,7 +284,6 @@

   tran_low_t *outptr = out;

   tran_low_t temp_in[16], temp_out[16];

   const highbd_transform_2d ht = HIGH_IHT_16[tx_type];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // Rows

   for (i = 0; i < 16; ++i) {

@@ -307,7 +304,7 @@

 // idct

-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd) {

   if (eob > 1)

     vpx_highbd_idct4x4_16_add(input, dest, stride, bd);

@@ -315,7 +312,7 @@

     vpx_highbd_idct4x4_1_add(input, dest, stride, bd);

-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd) {

   if (eob > 1)

     vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);

@@ -323,7 +320,7 @@

     vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);

-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd) {

   // If dc is 1, then input[0] is the reconstructed value, do not need

   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

@@ -340,7 +337,7 @@

-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,

+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,

                               int stride, int eob, int bd) {

   // The calculation can be simplified if there are not many non-zero dct

   // coefficients. Use eobs to separate different cases.

@@ -356,7 +353,7 @@

-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,

+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,

                               int stride, int eob, int bd) {

   // Non-zero coeff only in upper-left 8x8

   if (eob == 1) {

@@ -372,7 +369,7 @@

 // iht

 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,

-                           uint8_t *dest, int stride, int eob, int bd) {

+                           uint16_t *dest, int stride, int eob, int bd) {

   if (tx_type == DCT_DCT)

     vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);

   else

@@ -380,7 +377,7 @@

 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,

-                           uint8_t *dest, int stride, int eob, int bd) {

+                           uint16_t *dest, int stride, int eob, int bd) {

   if (tx_type == DCT_DCT) {

     vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);

   } else {

@@ -389,7 +386,7 @@

 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,

-                             uint8_t *dest, int stride, int eob, int bd) {

+                             uint16_t *dest, int stride, int eob, int bd) {

   if (tx_type == DCT_DCT) {

     vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);

   } else {

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -57,22 +57,22 @@

                       int stride, int eob);

 #if CONFIG_VP9_HIGHBITDEPTH

-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd);

-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd);

-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,

+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,

                             int eob, int bd);

-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,

+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,

                               int stride, int eob, int bd);

-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,

+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,

                               int stride, int eob, int bd);

 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,

-                           uint8_t *dest, int stride, int eob, int bd);

+                           uint16_t *dest, int stride, int eob, int bd);

 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,

-                           uint8_t *dest, int stride, int eob, int bd);

+                           uint16_t *dest, int stride, int eob, int bd);

 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,

-                             uint8_t *dest, int stride, int eob, int bd);

+                             uint16_t *dest, int stride, int eob, int bd);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #ifdef __cplusplus

 }  // extern "C"

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -101,11 +101,11 @@

   # Note as optimized versions of these functions are added we need to add a check to ensure

   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.

-  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";

+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";

-  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";

+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";

-  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";

+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";

--- a/vp9/decoder/vp9_decodeframe.c

+++ b/vp9/decoder/vp9_decodeframe.c

@@ -189,21 +189,22 @@

   assert(eob > 0);

 #if CONFIG_VP9_HIGHBITDEPTH

   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

     if (xd->lossless) {

-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);

+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);

     } else {

       switch (tx_size) {

         case TX_4X4:

-          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_8X8:

-          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_16X16:

-          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_32X32:

-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);

           break;

         default: assert(0 && "Invalid transform size");

@@ -256,21 +257,22 @@

   assert(eob > 0);

 #if CONFIG_VP9_HIGHBITDEPTH

   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

     if (xd->lossless) {

-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);

+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);

     } else {

       switch (tx_size) {

         case TX_4X4:

-          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_8X8:

-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_16X16:

-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);

           break;

         case TX_32X32:

-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);

+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);

           break;

         default: assert(0 && "Invalid transform size");

--- a/vp9/encoder/vp9_block.h

+++ b/vp9/encoder/vp9_block.h

@@ -184,7 +184,7 @@

   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);

   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);

 #if CONFIG_VP9_HIGHBITDEPTH

-  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,

+  void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,

                           int eob, int bd);

 #endif

};

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -637,17 +637,18 @@

   if (x->skip_encode || p->eobs[block] == 0) return;

 #if CONFIG_VP9_HIGHBITDEPTH

   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

     switch (tx_size) {

       case TX_32X32:

-        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],

+        vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],

                                  xd->bd);

         break;

       case TX_16X16:

-        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],

+        vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],

                                  xd->bd);

         break;

       case TX_8X8:

-        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],

+        vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],

                                xd->bd);

         break;

       case TX_4X4:

@@ -654,7 +655,7 @@

         // this is like vp9_short_idct4x4 but has a special case around eob<=1

         // which is significant (not just an optimization) for the lossless

         // case.

-        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],

+        x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],

                            xd->bd);

         break;

       default: assert(0 && "Invalid transform size");

@@ -699,7 +700,8 @@

   if (p->eobs[block] > 0) {

 #if CONFIG_VP9_HIGHBITDEPTH

     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

-      x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);

+      x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,

+                         p->eobs[block], xd->bd);

       return;

 #endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -799,6 +801,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

     switch (tx_size) {

       case TX_32X32:

         if (!x->skip_recode) {

@@ -814,7 +817,7 @@

           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;

         if (!x->skip_encode && *eob) {

-          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);

+          vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);

         break;

       case TX_16X16:

@@ -834,7 +837,7 @@

           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;

         if (!x->skip_encode && *eob) {

-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob,

+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,

                                   xd->bd);

         break;

@@ -855,7 +858,7 @@

           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;

         if (!x->skip_encode && *eob) {

-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,

+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,

                                 xd->bd);

         break;

@@ -880,9 +883,10 @@

             // this is like vp9_short_idct4x4 but has a special case around

             // eob<=1 which is significant (not just an optimization) for the

             // lossless case.

-            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);

+            x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);

           } else {

-            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);

+            vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,

+                                     xd->bd);

         break;

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -601,26 +601,26 @@

       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

         vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,

                                  32, NULL, 0, NULL, 0, bs, bs, xd->bd);

-        recon = CONVERT_TO_BYTEPTR(recon16);

         if (xd->lossless) {

-          vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);

+          vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);

         } else {

           switch (tx_size) {

             case TX_4X4:

-              vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);

+              vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);

               break;

             case TX_8X8:

-              vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);

+              vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);

               break;

             case TX_16X16:

-              vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);

+              vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);

               break;

             case TX_32X32:

-              vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);

+              vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);

               break;

             default: assert(0 && "Invalid transform size");

+        recon = CONVERT_TO_BYTEPTR(recon16);

       } else {

 #endif  // CONFIG_VP9_HIGHBITDEPTH

         vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);

@@ -1004,6 +1004,7 @@

           const int block = (row + idy) * 2 + (col + idx);

           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];

           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];

+          uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

           int16_t *const src_diff =

               vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);

           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);

@@ -1025,7 +1026,7 @@

             tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);

             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)

               goto next_highbd;

-            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst,

+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,

                                    dst_stride, p->eobs[block], xd->bd);

           } else {

             int64_t unused;

@@ -1048,7 +1049,7 @@

             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)

               goto next_highbd;

             vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),

-                                  dst, dst_stride, p->eobs[block], xd->bd);

+                                  dst16, dst_stride, p->eobs[block], xd->bd);

--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c

@@ -1268,10 +1268,8 @@

-void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,

                                        int stride, int bd) {

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (bd == 8) {

     int16_t row_idct_output[16 * 16];

@@ -1313,10 +1311,8 @@

-void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,

                                       int stride, int bd) {

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (bd == 8) {

     int16_t row_idct_output[16 * 16];

@@ -1349,10 +1345,8 @@

-void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,

                                       int stride, int bd) {

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (bd == 8) {

     int16_t row_idct_output[4 * 16];

@@ -1414,7 +1408,7 @@

   *dest += stride;

-void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,

                                      int stride, int bd) {

   const tran_low_t out0 =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

@@ -1422,7 +1416,6 @@

       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

   const int16x8_t dc = vdupq_n_s16(a1);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int i;

   if (a1 >= 0) {

--- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c

@@ -386,8 +386,8 @@

 static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,

-                                             uint8_t *const dest,

-                                             const int stride, const int bd) {

+                                             uint16_t *dst, const int stride,

+                                             const int bd) {

   int i, idct32_pass_loop;

   int32_t trans_buf[32 * 8];

   int32_t pass1[32 * 32];

@@ -394,7 +394,6 @@

   int32_t pass2[32 * 32];

   int32_t *out;

   int32x4x2_t q[16];

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);

   for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;

        idct32_pass_loop++, input = pass1, out = pass2) {

@@ -637,10 +636,10 @@

-void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,

+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,

                                         int stride, int bd) {

   if (bd == 8) {

-    vpx_idct32_32_neon(input, dest, stride, 1);

+    vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);

   } else {

     vpx_highbd_idct32_32_neon(input, dest, stride, bd);

--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c

@@ -726,10 +726,9 @@

   highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);

-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,

                                        int stride, int bd) {

   int i;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (bd == 8) {

     int16_t temp[32 * 16];

--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c

@@ -594,10 +594,9 @@

   highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);

-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,

                                       int stride, int bd) {

   int i;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (bd == 8) {

     int16_t temp[32 * 8];

--- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c

@@ -59,7 +59,7 @@

   *dest += stride;

-void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,

                                      int stride, int bd) {

   const tran_low_t out0 =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

@@ -67,7 +67,6 @@

       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

   const int16x8_t dc = vdupq_n_s16(a1);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int i;

   if (a1 >= 0) {

--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c

@@ -51,7 +51,7 @@

   *dest += stride;

-void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,

                                    int stride, int bd) {

   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

   const tran_low_t out0 =

@@ -60,7 +60,6 @@

       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);

   const int16x8_t dc = vdupq_n_s16(a1);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);

   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);

@@ -133,7 +132,7 @@

   *a3 = vsubq_s32(b0, b3);

-void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,

                                     int stride, int bd) {

   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

   int32x4_t c0 = vld1q_s32(input);

@@ -140,7 +139,6 @@

   int32x4_t c1 = vld1q_s32(input + 4);

   int32x4_t c2 = vld1q_s32(input + 8);

   int32x4_t c3 = vld1q_s32(input + 12);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int16x8_t a0, a1;

   if (bd == 8) {

--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c

+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c

@@ -36,7 +36,7 @@

   *dest += stride;

-void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,

                                    int stride, int bd) {

   const tran_low_t out0 =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

@@ -44,7 +44,6 @@

       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);

   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);

   const int16x8_t dc = vdupq_n_s16(a1);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   if (a1 >= 0) {

     const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

@@ -292,9 +291,8 @@

   vst1q_u16(dest, d7_u16);

-void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,

                                     int stride, int bd) {

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int32x4_t a0 = vld1q_s32(input);

   int32x4_t a1 = vld1q_s32(input + 8);

   int32x4_t a2 = vld1q_s32(input + 16);

@@ -553,9 +551,8 @@

   *io7 = vsubq_s32(step1[0], step2[7]);

-void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,

                                     int stride, int bd) {

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   int32x4_t a0 = vld1q_s32(input);

   int32x4_t a1 = vld1q_s32(input + 4);

   int32x4_t a2 = vld1q_s32(input + 8);

--- a/vpx_dsp/arm/idct32x32_add_neon.c

+++ b/vpx_dsp/arm/idct32x32_add_neon.c

@@ -517,7 +517,7 @@

   const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1

   int16_t *out;

   int16x8_t q[16];

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);

+  uint16_t *dst = CAST_TO_SHORTPTR(dest);

   for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;

        idct32_pass_loop++, out = pass2) {

--- a/vpx_dsp/inv_txfm.c

+++ b/vpx_dsp/inv_txfm.c

@@ -1290,7 +1290,7 @@

   return 0;

-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,

                                  int stride, int bd) {

   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,

      0.5 shifts per pixel. */

@@ -1299,7 +1299,6 @@

   tran_high_t a1, b1, c1, d1, e1;

   const tran_low_t *ip = input;

   tran_low_t *op = output;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   for (i = 0; i < 4; i++) {

     a1 = ip[0] >> UNIT_QUANT_SHIFT;

@@ -1348,7 +1347,7 @@

-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,

+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,

                                 int stride, int bd) {

   int i;

   tran_high_t a1, e1;

@@ -1355,7 +1354,6 @@

   tran_low_t tmp[4];

   const tran_low_t *ip = in;

   tran_low_t *op = tmp;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   (void)bd;

   a1 = ip[0] >> UNIT_QUANT_SHIFT;

@@ -1452,13 +1450,12 @@

   output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);

-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,

                                  int stride, int bd) {

   int i, j;

   tran_low_t out[4 * 4];

   tran_low_t *outptr = out;

   tran_low_t temp_in[4], temp_out[4];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // Rows

   for (i = 0; i < 4; ++i) {

@@ -1478,13 +1475,12 @@

-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,

                                 int stride, int bd) {

   int i;

   tran_high_t a1;

   tran_low_t out =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

   a1 = ROUND_POWER_OF_TWO(out, 4);

@@ -1636,13 +1632,12 @@

   output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);

-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,

                                  int stride, int bd) {

   int i, j;

   tran_low_t out[8 * 8];

   tran_low_t *outptr = out;

   tran_low_t temp_in[8], temp_out[8];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // First transform rows

   for (i = 0; i < 8; ++i) {

@@ -1662,13 +1657,12 @@

-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,

                                  int stride, int bd) {

   int i, j;

   tran_low_t out[8 * 8] = { 0 };

   tran_low_t *outptr = out;

   tran_low_t temp_in[8], temp_out[8];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // First transform rows

   // Only first 4 row has non-zero coefs

@@ -1689,13 +1683,12 @@

-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,

                                 int stride, int bd) {

   int i, j;

   tran_high_t a1;

   tran_low_t out =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

   a1 = ROUND_POWER_OF_TWO(out, 5);

@@ -2056,13 +2049,12 @@

   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);

-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,

                                     int stride, int bd) {

   int i, j;

   tran_low_t out[16 * 16];

   tran_low_t *outptr = out;

   tran_low_t temp_in[16], temp_out[16];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // First transform rows

   for (i = 0; i < 16; ++i) {

@@ -2082,13 +2074,12 @@

-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,

                                    int stride, int bd) {

   int i, j;

   tran_low_t out[16 * 16] = { 0 };

   tran_low_t *outptr = out;

   tran_low_t temp_in[16], temp_out[16];

-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);

   // First transform rows. Since all non-zero dct coefficients are in

   // upper-left 8x8 area, we only need to calculate first 8 rows here.

@@ -2111,13 +2102,12 @@

-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,

                                    int stride, int bd) {

   int i, j;

   tran_low_t out[16 * 16] = { 0 };

   tran_low_t *outptr = out;

   tran_low_t temp_in[16], temp_out[16];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // First transform rows. Since all non-zero dct coefficients are in

   // upper-left 4x4 area, we only need to calculate first 4 rows here.

@@ -2138,13 +2128,12 @@

-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,

                                   int stride, int bd) {

   int i, j;

   tran_high_t a1;

   tran_low_t out =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

   a1 = ROUND_POWER_OF_TWO(out, 6);

@@ -2531,13 +2520,12 @@

   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);

-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,

                                      int stride, int bd) {

   int i, j;

   tran_low_t out[32 * 32];

   tran_low_t *outptr = out;

   tran_low_t temp_in[32], temp_out[32];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // Rows

   for (i = 0; i < 32; ++i) {

@@ -2569,13 +2557,12 @@

-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,

                                     int stride, int bd) {

   int i, j;

   tran_low_t out[32 * 32] = { 0 };

   tran_low_t *outptr = out;

   tran_low_t temp_in[32], temp_out[32];

-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);

   // Rows

   // Only upper-left 16x16 has non-zero coeff

@@ -2598,13 +2585,12 @@

-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,

                                    int stride, int bd) {

   int i, j;

   tran_low_t out[32 * 32] = { 0 };

   tran_low_t *outptr = out;

   tran_low_t temp_in[32], temp_out[32];

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   // Rows

   // Only upper-left 8x8 has non-zero coeff

@@ -2625,11 +2611,10 @@

-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,

+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,

                                   int stride, int bd) {

   int i, j;

   int a1;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   tran_low_t out =

       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -231,6 +231,11 @@

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_34_add_neon.c

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_135_add_neon.c

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_1024_add_neon.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_inv_txfm_sse2.h

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct4x4_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c

 endif  # !CONFIG_VP9_HIGHBITDEPTH

 ifeq ($(HAVE_NEON_ASM),yes)

@@ -350,6 +355,9 @@

 DSP_SRCS-$(HAVE_VSX)  += ppc/types_vsx.h

 DSP_SRCS-$(HAVE_VSX)  += ppc/transpose_vsx.h

 DSP_SRCS-$(HAVE_VSX)  += ppc/bitdepth_conversion_vsx.h

+# X86 utilities

+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h

 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -629,39 +629,39 @@

   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.

   specialize qw/vpx_iwht4x4_16_add sse2/;

-  add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   specialize qw/vpx_highbd_idct4x4_1_add neon/;

-  add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   specialize qw/vpx_highbd_idct8x8_1_add neon/;

-  add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   specialize qw/vpx_highbd_idct16x16_1_add neon/;

-  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;

-  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

-  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";

+  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";

   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {

     specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

@@ -1,0 +1,244 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                       int stride, int bd) {

+  tran_low_t out[16 * 16];  // row-pass output read by the C column pass

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[32];

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i rounding = _mm_set1_epi16(32);  // added before the final >> 6

+  const __m128i max = _mm_set1_epi16(3155);  // SSE2 path bound (see checks)

+  const __m128i min = _mm_set1_epi16(-3155);

+  int optimised_cols = 0;

+  // Load each row and pack its 32-bit coeffs to 16 bits (signed saturation)

+  for (i = 0; i < 16; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // row i, cols 0-7

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);  // row i, cols 8-15

+  }

+  // Range check: any 16-bit lane outside [-3155, 3155] sets bits in test

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 32; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);

+  if (!test) {  // input in range: take the 16-bit SSE2 row pass

+    // Do the row transform in 16-bit lanes; the result is left in inptr

+    idct16_sse2(inptr, inptr + 16);

+    // Repeat the range check on the row output before the column transform

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 32; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // out of range: widen rows into out[] for the C columns

+      array_transpose_16x16(inptr, inptr + 16);

+      for (i = 0; i < 16; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xffff where negative

+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // sign-extend lo 4

+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // sign-extend hi 4

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

+      }

+    } else {

+      // In range: use the optimised SSE2 transform for the columns

+      optimised_cols = 1;

+    }

+  } else {

+    // Out of range: run the un-optimised C row transform on each row

+    for (i = 0; i < 16; ++i) {

+      vpx_highbd_idct16_c(input, outptr, bd);

+      input += 16;

+      outptr += 16;

+    }

+  }

+  if (optimised_cols) {

+    idct16_sse2(inptr, inptr + 16);

+    // Final round (+32) & shift (>> 6), add to dest, clamp for bd, and store

+    {

+      __m128i d[2];

+      for (i = 0; i < 16; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], rounding);

+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

+        inptr[i] = _mm_srai_epi16(inptr[i], 6);

+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

+        // Store the two 8-pixel halves of row i

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

+      }

+    }

+  } else {

+    // Run the un-optimised C column transform on the 32-bit rows in out[]

+    tran_low_t temp_in[16], temp_out[16];

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // column i

+      vpx_highbd_idct16_c(temp_in, temp_out, bd);

+      for (j = 0; j < 16; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+      }

+    }

+  }

+}

+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                      int stride, int bd) {

+  tran_low_t out[16 * 16] = { 0 };  // zeroed: only rows 0-3 get written

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[32];

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i rounding = _mm_set1_epi16(32);  // added before the final >> 6

+  const __m128i max = _mm_set1_epi16(3155);  // SSE2 path bound (see checks)

+  const __m128i min = _mm_set1_epi16(-3155);

+  int optimised_cols = 0;

+  // Load each row and pack its 32-bit coeffs to 16 bits (signed saturation)

+  for (i = 0; i < 16; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // row i, cols 0-7

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);  // row i, cols 8-15

+  }

+  // Range-check the input for the row transform (lanes within +/-3155)

+  // Since all non-zero dct coefficients are in upper-left 4x4 area,

+  // we only need to consider first 4 rows here.

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 4; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);

+  if (!test) {  // input in range: take the 16-bit SSE2 row pass

+    // Do the row transform (N.B. This transposes inptr)

+    idct16_sse2(inptr, inptr + 16);

+    // Range-check the row output (inptr[0..15]) for the column transform

+    // N.B. Only first 4 cols contain non-zero coeffs

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 16; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // out of range: widen rows into out[] for the C columns

+      // Use the fact that only the first 4 rows contain non-zero coeffs

+      array_transpose_8x8(inptr, inptr);

+      array_transpose_8x8(inptr + 8, inptr + 16);

+      for (i = 0; i < 4; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xffff where negative

+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // sign-extend lo 4

+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // sign-extend hi 4

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

+      }

+    } else {

+      // In range: use the optimised SSE2 transform for the columns

+      optimised_cols = 1;

+    }

+  } else {

+    // Out of range: run the un-optimised C row transform on rows 0-3 only

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct16_c(input, outptr, bd);

+      input += 16;

+      outptr += 16;

+    }

+  }

+  if (optimised_cols) {

+    idct16_sse2(inptr, inptr + 16);

+    // Final round (+32) & shift (>> 6), add to dest, clamp for bd, and store

+    {

+      __m128i d[2];

+      for (i = 0; i < 16; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], rounding);

+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

+        inptr[i] = _mm_srai_epi16(inptr[i], 6);

+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

+        // Store the two 8-pixel halves of row i

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

+      }

+    }

+  } else {

+    // Run the un-optimised C column transform on the 32-bit rows in out[]

+    tran_low_t temp_in[16], temp_out[16];

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // column i

+      vpx_highbd_idct16_c(temp_in, temp_out, bd);

+      for (j = 0; j < 16; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

@@ -1,0 +1,41 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                     int stride, int bd) {

+  __m128i dc_value, d;

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i one = _mm_set1_epi16(1);  // used to build max = (1 << bd) - 1

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

+  int a, i, j;

+  tran_low_t out;

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

+  a = ROUND_POWER_OF_TWO(out, 6);  // DC term; only input[0] is read

+  d = _mm_set1_epi32(a);

+  dc_value = _mm_packs_epi32(d, d);  // a (saturated to int16) in all 8 lanes

+  for (i = 0; i < 32; ++i) {  // one row per iteration

+    for (j = 0; j < 4; ++j) {  // 4 x 8 pixels = 32 columns

+      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));

+      d = _mm_adds_epi16(d, dc_value);  // saturating add of the DC term

+      d = _mm_max_epi16(d, zero);  // clamp below at 0

+      d = _mm_min_epi16(d, max);  // clamp above at (1 << bd) - 1

+      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);

+    }

+    dest += stride;  // advance to the next row

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

@@ -1,0 +1,129 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 4x4 inverse DCT added into dest

+  tran_low_t out[4 * 4];  // row-pass result, used only by the C column fallback

+  tran_low_t *outptr = out;

+  int i, j;

+  __m128i inptr[4];

+  __m128i sign_bits[2];

+  __m128i temp_mm, min_input, max_input;

+  int test;

+  int optimised_cols = 0;  // set when both passes can stay in 16-bit SSE2

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i eight = _mm_set1_epi16(8);  // rounding term for the final >> 4

+  const __m128i max = _mm_set1_epi16(12043);  // NOTE(review): presumably the 16-bit overflow-safe bound for idct4_sse2 - confirm

+  const __m128i min = _mm_set1_epi16(-12043);

+  // Load the 16 coefficients, four 32-bit values per vector

+  inptr[0] = _mm_loadu_si128((const __m128i *)input);

+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));

+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));

+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

+  // Pack to 16 bits with signed saturation (saturated values fail the range test below)

+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);

+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  max_input = _mm_cmpgt_epi16(max_input, max);  // lanes above +12043

+  min_input = _mm_cmplt_epi16(min_input, min);  // lanes below -12043

+  temp_mm = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp_mm);  // non-zero if any coefficient is out of range

+  if (!test) {

+    // Do the row transform in 16-bit SSE2

+    idct4_sse2(inptr);

+    // Check the min & max values again before the column transform

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp_mm = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp_mm);

+    if (test) {  // row results out of range: hand them to the C column pass via out[]

+      transpose_4x4(inptr);  // NOTE(review): presumably restores row-major order after idct4_sse2 - confirm

+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);  // 0xFFFF in negative lanes

+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);

+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);  // sign-extend to 32 bits

+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);

+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);

+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);

+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);

+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);

+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);

+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised (C) row transform, one row of 4 per call

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct4_c(input, outptr, bd);

+      input += 4;

+      outptr += 4;

+    }

+  }

+  if (optimised_cols) {

+    idct4_sse2(inptr);  // column transform, still in 16-bit lanes

+    // Final round and shift: (x + 8) >> 4, same amount as the C path below

+    inptr[0] = _mm_add_epi16(inptr[0], eight);

+    inptr[1] = _mm_add_epi16(inptr[1], eight);

+    inptr[0] = _mm_srai_epi16(inptr[0], 4);

+    inptr[1] = _mm_srai_epi16(inptr[1], 4);

+    // Reconstruction and Store: d0 holds rows 0 and 1, d2 holds rows 2 and 3

+    {

+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);

+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));

+      d0 = _mm_unpacklo_epi64(

+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));

+      d2 = _mm_unpacklo_epi64(

+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));

+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);  // add residual, clamp to bd range

+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);

+      // store row 0 (low 64 bits of d0)

+      _mm_storel_epi64((__m128i *)dest, d0);

+      // store row 1 (high 64 bits of d0, shifted down)

+      d0 = _mm_srli_si128(d0, 8);

+      _mm_storel_epi64((__m128i *)(dest + stride), d0);

+      // store row 2 (low 64 bits of d2)

+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);

+      // store row 3 (high 64 bits of d2, shifted down)

+      d2 = _mm_srli_si128(d2, 8);

+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);

+    }

+  } else {

+    // Run the un-optimised (C) column transform on the 32-bit values in out[]

+    tran_low_t temp_in[4], temp_out[4];

+    // Columns: gather column i, transform it, then round, add and clip into dest

+    for (i = 0; i < 4; ++i) {

+      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];

+      vpx_highbd_idct4_c(temp_in, temp_out, bd);

+      for (j = 0; j < 4; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

@@ -1,0 +1,216 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 8x8 inverse DCT added into dest

+  tran_low_t out[8 * 8];  // row-pass result, used only by the C column fallback

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[8];

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i sixteen = _mm_set1_epi16(16);  // rounding term for the final >> 5

+  const __m128i max = _mm_set1_epi16(6201);  // NOTE(review): presumably the 16-bit overflow-safe bound for idct8_sse2 - confirm

+  const __m128i min = _mm_set1_epi16(-6201);

+  int optimised_cols = 0;  // set when both passes can stay in 16-bit SSE2

+  // Load each row as two vectors of 32-bit values & pack to 16 bits (signed saturation)

+  for (i = 0; i < 8; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the per-lane min & max over all 8 rows for the row transform

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 8; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);  // lanes above +6201

+  min_input = _mm_cmplt_epi16(min_input, min);  // lanes below -6201

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any coefficient is out of range

+  if (!test) {

+    // Do the row transform in 16-bit SSE2

+    idct8_sse2(inptr);

+    // Find the min & max of the row results for the column transform

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 8; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // row results out of range: hand them to the C column pass via out[]

+      array_transpose_8x8(inptr, inptr);  // NOTE(review): presumably restores row-major order after idct8_sse2 - confirm

+      for (i = 0; i < 8; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4..7 sign-extended to 32 bits

+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0..3 sign-extended to 32 bits

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);  // out[8 * i + 4 .. 8 * i + 7]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);  // out[8 * i .. 8 * i + 3]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised (C) row transform, one row of 8 per call

+    for (i = 0; i < 8; ++i) {

+      vpx_highbd_idct8_c(input, outptr, bd);

+      input += 8;

+      outptr += 8;

+    }

+  }

+  if (optimised_cols) {

+    idct8_sse2(inptr);  // column transform, still in 16-bit lanes

+    // Final round & shift ((x + 16) >> 5), then Reconstruction and Store, row by row

+    {

+      __m128i d[8];

+      for (i = 0; i < 8; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        inptr[i] = _mm_srai_epi16(inptr[i], 5);

+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);  // add residual, clamp to bd range

+        // Store the reconstructed row

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

+      }

+    }

+  } else {

+    // Run the un-optimised (C) column transform on the 32-bit values in out[]

+    tran_low_t temp_in[8], temp_out[8];

+    for (i = 0; i < 8; ++i) {

+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // gather column i

+      vpx_highbd_idct8_c(temp_in, temp_out, bd);

+      for (j = 0; j < 8; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

+      }

+    }

+  }

+}

+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 8x8 inverse DCT, sparse-input variant

+  tran_low_t out[8 * 8] = { 0 };  // zeroed: fallbacks fill only rows 0..3, column pass reads all 8

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[8];

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i sixteen = _mm_set1_epi16(16);  // rounding term for the final >> 5

+  const __m128i max = _mm_set1_epi16(6201);  // NOTE(review): presumably the 16-bit overflow-safe bound for idct8_sse2 - confirm

+  const __m128i min = _mm_set1_epi16(-6201);

+  int optimised_cols = 0;  // set when both passes can stay in 16-bit SSE2

+  // Load each row as two vectors of 32-bit values & pack to 16 bits (signed saturation)

+  for (i = 0; i < 8; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the min & max for the row transform

+  // Only the first 4 rows have non-zero coeffs, so only inptr[0..3] are scanned

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 4; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);  // lanes above +6201

+  min_input = _mm_cmplt_epi16(min_input, min);  // lanes below -6201

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any scanned coefficient is out of range

+  if (!test) {

+    // Do the row transform in 16-bit SSE2

+    idct8_sse2(inptr);

+    // Find the min & max for the column transform

+    // N.B. Only first 4 cols contain non-zero coeffs (all 8 vectors are still scanned)

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 8; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // row results out of range: hand them to the C column pass via out[]

+      // Use fact only first 4 rows contain non-zero coeffs

+      array_transpose_4X8(inptr, inptr);

+      for (i = 0; i < 4; i++) {  // rows 4..7 of out[] keep their zero initialisation

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4..7 sign-extended to 32 bits

+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0..3 sign-extended to 32 bits

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);  // out[8 * i + 4 .. 8 * i + 7]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);  // out[8 * i .. 8 * i + 3]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised (C) row transform on the first 4 rows only

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct8_c(input, outptr, bd);

+      input += 8;

+      outptr += 8;

+    }

+  }

+  if (optimised_cols) {

+    idct8_sse2(inptr);  // column transform, still in 16-bit lanes

+    // Final round & shift ((x + 16) >> 5), then Reconstruction and Store, row by row

+    {

+      __m128i d[8];

+      for (i = 0; i < 8; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        inptr[i] = _mm_srai_epi16(inptr[i], 5);

+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);  // add residual, clamp to bd range

+        // Store the reconstructed row

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

+      }

+    }

+  } else {

+    // Run the un-optimised (C) column transform on the 32-bit values in out[]

+    tran_low_t temp_in[8], temp_out[8];

+    for (i = 0; i < 8; ++i) {

+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // gather column i

+      vpx_highbd_idct8_c(temp_in, temp_out, bd);

+      for (j = 0; j < 8; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

@@ -1,0 +1,33 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

+#include <emmintrin.h>  // SSE2

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/inv_txfm.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {  // clamp 16-bit lanes to [0, (1 << bd) - 1]

+  __m128i ubounded, retval;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i one = _mm_set1_epi16(1);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 per lane

+  ubounded = _mm_cmpgt_epi16(value, max);  // all-ones mask in lanes above max (signed compare)

+  retval = _mm_andnot_si128(ubounded, value);  // keep value where it is <= max, zero elsewhere

+  ubounded = _mm_and_si128(ubounded, max);  // max in the lanes that overflowed

+  retval = _mm_or_si128(retval, ubounded);  // merge: min(value, max) per lane

+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));  // zero the lanes that are not > 0

+  return retval;

+}

+#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

--- a/vpx_dsp/x86/inv_txfm_sse2.c

+++ b/vpx_dsp/x86/inv_txfm_sse2.c

@@ -10,6 +10,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

 #include "vpx_dsp/x86/txfm_common_sse2.h"

 #define RECON_AND_STORE4X4(dest, in_x)                    \

@@ -170,14 +171,6 @@

   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);

-static INLINE void transpose_4x4(__m128i *res) {

-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);

-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);

-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);

-}

 void idct4_sse2(__m128i *in) {

   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);

   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

@@ -3349,595 +3342,3 @@

     RECON_AND_STORE(dest + 24 + j * stride, dc_value);

-#if CONFIG_VP9_HIGHBITDEPTH

-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {

-  __m128i ubounded, retval;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

-  ubounded = _mm_cmpgt_epi16(value, max);

-  retval = _mm_andnot_si128(ubounded, value);

-  ubounded = _mm_and_si128(ubounded, max);

-  retval = _mm_or_si128(retval, ubounded);

-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));

-  return retval;

-}

-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                    int stride, int bd) {

-  tran_low_t out[4 * 4];

-  tran_low_t *outptr = out;

-  int i, j;

-  __m128i inptr[4];

-  __m128i sign_bits[2];

-  __m128i temp_mm, min_input, max_input;

-  int test;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  int optimised_cols = 0;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i eight = _mm_set1_epi16(8);

-  const __m128i max = _mm_set1_epi16(12043);

-  const __m128i min = _mm_set1_epi16(-12043);

-  // Load input into __m128i

-  inptr[0] = _mm_loadu_si128((const __m128i *)input);

-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));

-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));

-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

-  // Pack to 16 bits

-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);

-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp_mm = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp_mm);

-  if (!test) {

-    // Do the row transform

-    idct4_sse2(inptr);

-    // Check the min & max values

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp_mm = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp_mm);

-    if (test) {

-      transpose_4x4(inptr);

-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);

-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);

-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);

-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);

-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);

-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);

-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);

-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);

-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);

-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct4_c(input, outptr, bd);

-      input += 4;

-      outptr += 4;

-    }

-  }

-  if (optimised_cols) {

-    idct4_sse2(inptr);

-    // Final round and shift

-    inptr[0] = _mm_add_epi16(inptr[0], eight);

-    inptr[1] = _mm_add_epi16(inptr[1], eight);

-    inptr[0] = _mm_srai_epi16(inptr[0], 4);

-    inptr[1] = _mm_srai_epi16(inptr[1], 4);

-    // Reconstruction and Store

-    {

-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);

-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));

-      d0 = _mm_unpacklo_epi64(

-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));

-      d2 = _mm_unpacklo_epi64(

-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));

-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);

-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);

-      // store input0

-      _mm_storel_epi64((__m128i *)dest, d0);

-      // store input1

-      d0 = _mm_srli_si128(d0, 8);

-      _mm_storel_epi64((__m128i *)(dest + stride), d0);

-      // store input2

-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);

-      // store input3

-      d2 = _mm_srli_si128(d2, 8);

-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[4], temp_out[4];

-    // Columns

-    for (i = 0; i < 4; ++i) {

-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];

-      vpx_highbd_idct4_c(temp_in, temp_out, bd);

-      for (j = 0; j < 4; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                    int stride, int bd) {

-  tran_low_t out[8 * 8];

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[8];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i sixteen = _mm_set1_epi16(16);

-  const __m128i max = _mm_set1_epi16(6201);

-  const __m128i min = _mm_set1_epi16(-6201);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 8; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 8; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct8_sse2(inptr);

-    // Find the min & max for the column transform

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 8; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      array_transpose_8x8(inptr, inptr);

-      for (i = 0; i < 8; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 8; ++i) {

-      vpx_highbd_idct8_c(input, outptr, bd);

-      input += 8;

-      outptr += 8;

-    }

-  }

-  if (optimised_cols) {

-    idct8_sse2(inptr);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[8];

-      for (i = 0; i < 8; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        inptr[i] = _mm_srai_epi16(inptr[i], 5);

-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[8], temp_out[8];

-    for (i = 0; i < 8; ++i) {

-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-      vpx_highbd_idct8_c(temp_in, temp_out, bd);

-      for (j = 0; j < 8; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                    int stride, int bd) {

-  tran_low_t out[8 * 8] = { 0 };

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[8];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i sixteen = _mm_set1_epi16(16);

-  const __m128i max = _mm_set1_epi16(6201);

-  const __m128i min = _mm_set1_epi16(-6201);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 8; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  // only first 4 row has non-zero coefs

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 4; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct8_sse2(inptr);

-    // Find the min & max for the column transform

-    // N.B. Only first 4 cols contain non-zero coeffs

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 8; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      // Use fact only first 4 rows contain non-zero coeffs

-      array_transpose_4X8(inptr, inptr);

-      for (i = 0; i < 4; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct8_c(input, outptr, bd);

-      input += 8;

-      outptr += 8;

-    }

-  }

-  if (optimised_cols) {

-    idct8_sse2(inptr);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[8];

-      for (i = 0; i < 8; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        inptr[i] = _mm_srai_epi16(inptr[i], 5);

-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[8], temp_out[8];

-    for (i = 0; i < 8; ++i) {

-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-      vpx_highbd_idct8_c(temp_in, temp_out, bd);

-      for (j = 0; j < 8; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                       int stride, int bd) {

-  tran_low_t out[16 * 16];

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[32];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_set1_epi16(32);

-  const __m128i max = _mm_set1_epi16(3155);

-  const __m128i min = _mm_set1_epi16(-3155);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 16; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 32; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct16_sse2(inptr, inptr + 16);

-    // Find the min & max for the column transform

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 32; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      array_transpose_16x16(inptr, inptr + 16);

-      for (i = 0; i < 16; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 16; ++i) {

-      vpx_highbd_idct16_c(input, outptr, bd);

-      input += 16;

-      outptr += 16;

-    }

-  }

-  if (optimised_cols) {

-    idct16_sse2(inptr, inptr + 16);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[2];

-      for (i = 0; i < 16; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], rounding);

-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

-        inptr[i] = _mm_srai_epi16(inptr[i], 6);

-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[16], temp_out[16];

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];

-      vpx_highbd_idct16_c(temp_in, temp_out, bd);

-      for (j = 0; j < 16; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                      int stride, int bd) {

-  tran_low_t out[16 * 16] = { 0 };

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[32];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_set1_epi16(32);

-  const __m128i max = _mm_set1_epi16(3155);

-  const __m128i min = _mm_set1_epi16(-3155);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 16; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  // Since all non-zero dct coefficients are in upper-left 4x4 area,

-  // we only need to consider first 4 rows here.

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 4; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform (N.B. This transposes inptr)

-    idct16_sse2(inptr, inptr + 16);

-    // Find the min & max for the column transform

-    // N.B. Only first 4 cols contain non-zero coeffs

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 16; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      // Use fact only first 4 rows contain non-zero coeffs

-      array_transpose_8x8(inptr, inptr);

-      array_transpose_8x8(inptr + 8, inptr + 16);

-      for (i = 0; i < 4; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct16_c(input, outptr, bd);

-      input += 16;

-      outptr += 16;

-    }

-  }

-  if (optimised_cols) {

-    idct16_sse2(inptr, inptr + 16);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[2];

-      for (i = 0; i < 16; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], rounding);

-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

-        inptr[i] = _mm_srai_epi16(inptr[i], 6);

-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[16], temp_out[16];

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];

-      vpx_highbd_idct16_c(temp_in, temp_out, bd);

-      for (j = 0; j < 16; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,

-                                     int stride, int bd) {

-  __m128i dc_value, d;

-  const __m128i zero = _mm_setzero_si128();

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

-  int a, i, j;

-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

-  tran_low_t out;

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

-  a = ROUND_POWER_OF_TWO(out, 6);

-  d = _mm_set1_epi32(a);

-  dc_value = _mm_packs_epi32(d, d);

-  for (i = 0; i < 32; ++i) {

-    for (j = 0; j < 4; ++j) {

-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));

-      d = _mm_adds_epi16(d, dc_value);

-      d = _mm_max_epi16(d, zero);

-      d = _mm_min_epi16(d, max);

-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);

-    }

-    dest += stride;

-  }

-}

-#endif  // CONFIG_VP9_HIGHBITDEPTH

--- /dev/null

+++ b/vpx_dsp/x86/transpose_sse2.h

@@ -1,0 +1,26 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_

+#define VPX_DSP_X86_TRANSPOSE_SSE2_H_

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+static INLINE void transpose_4x4(__m128i *res) {

+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);

+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);

+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);

+}

+#endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_