shithub: libvpx

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -231,6 +231,11 @@

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_34_add_neon.c

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_135_add_neon.c

 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_1024_add_neon.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_inv_txfm_sse2.h

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct4x4_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c

+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c

 endif  # !CONFIG_VP9_HIGHBITDEPTH

 ifeq ($(HAVE_NEON_ASM),yes)

@@ -350,6 +355,9 @@

 DSP_SRCS-$(HAVE_VSX)  += ppc/types_vsx.h

 DSP_SRCS-$(HAVE_VSX)  += ppc/transpose_vsx.h

 DSP_SRCS-$(HAVE_VSX)  += ppc/bitdepth_conversion_vsx.h

+# X86 utilities

+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h

 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c

@@ -1,0 +1,244 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                       int stride, int bd) {  // 16x16 inverse transform added into dest: 16-bit SSE2 passes with a C fallback

+  tran_low_t out[16 * 16];  // row-pass results, read only by the C column pass

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[32];  // row i: inptr[i] = columns 0-7, inptr[i + 16] = columns 8-15

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i rounding = _mm_set1_epi16(32);  // 1 << 5, rounds the final >> 6

+  const __m128i max = _mm_set1_epi16(3155);  // NOTE(review): presumably keeps idct16_sse2 inside 16 bits - confirm

+  const __m128i min = _mm_set1_epi16(-3155);

+  int optimised_cols = 0;  // set to 1 when the column pass may also use idct16_sse2

+  // Load input into __m128i & pack to 16 bits (signed saturation, so
  // out-of-range coefficients still fail the range test below)

+  for (i = 0; i < 16; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the min & max for the row transform

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 32; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is outside [-3155, 3155]

+  if (!test) {

+    // Do the row transform

+    idct16_sse2(inptr, inptr + 16);

+    // Find the min & max for the column transform

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 32; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // row output out of range: widen it into out[] for the C column pass

+      array_transpose_16x16(inptr, inptr + 16);

+      for (i = 0; i < 16; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // sign-extend to 32 bits

+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);  // out[16 * i + 0..3]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);  // out[16 * i + 4..7]

+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);  // out[16 * i + 8..11]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);  // out[16 * i + 12..15]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised row transform on the original 32-bit input

+    for (i = 0; i < 16; ++i) {

+      vpx_highbd_idct16_c(input, outptr, bd);

+      input += 16;

+      outptr += 16;

+    }

+  }

+  if (optimised_cols) {

+    idct16_sse2(inptr, inptr + 16);

+    // Final round & shift and Reconstruction and Store

+    {

+      __m128i d[2];  // d[0] = dest pixels 0-7 of row i, d[1] = pixels 8-15

+      for (i = 0; i < 16; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], rounding);

+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

+        inptr[i] = _mm_srai_epi16(inptr[i], 6);

+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

+        // Store

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

+      }

+    }

+  } else {

+    // Run the un-optimised column transform

+    tran_low_t temp_in[16], temp_out[16];

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // gather column i of out[]

+      vpx_highbd_idct16_c(temp_in, temp_out, bd);

+      for (j = 0; j < 16; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+      }

+    }

+  }

+}

+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                      int stride, int bd) {  // 16x16 variant that only range-checks and row-transforms the first 4 rows

+  tran_low_t out[16 * 16] = { 0 };  // zeroed: only rows 0-3 are written before the C column pass

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[32];  // row i: inptr[i] = columns 0-7, inptr[i + 16] = columns 8-15

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i rounding = _mm_set1_epi16(32);  // 1 << 5, rounds the final >> 6

+  const __m128i max = _mm_set1_epi16(3155);  // NOTE(review): presumably keeps idct16_sse2 inside 16 bits - confirm

+  const __m128i min = _mm_set1_epi16(-3155);

+  int optimised_cols = 0;  // set to 1 when the column pass may also use idct16_sse2

+  // Load input into __m128i & pack to 16 bits (signed saturation)

+  for (i = 0; i < 16; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the min & max for the row transform

+  // Since all non-zero dct coefficients are in the upper-left 4x4 area,

+  // we only need to consider the first 4 rows here.

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 4; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any checked lane is outside [-3155, 3155]

+  if (!test) {

+    // Do the row transform (N.B. This transposes inptr)

+    idct16_sse2(inptr, inptr + 16);

+    // Find the min & max for the column transform

+    // N.B. Only the first 4 cols contain non-zero coeffs

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 16; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {

+      // Use the fact that only the first 4 rows contain non-zero coeffs

+      array_transpose_8x8(inptr, inptr);

+      array_transpose_8x8(inptr + 8, inptr + 16);

+      for (i = 0; i < 4; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // sign-extend to 32 bits

+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);  // out[16 * i + 0..3]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);  // out[16 * i + 4..7]

+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);  // out[16 * i + 8..11]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);  // out[16 * i + 12..15]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised row transform (first 4 rows only; the rest of out[] stays 0)

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct16_c(input, outptr, bd);

+      input += 16;

+      outptr += 16;

+    }

+  }

+  if (optimised_cols) {

+    idct16_sse2(inptr, inptr + 16);

+    // Final round & shift and Reconstruction and Store

+    {

+      __m128i d[2];  // d[0] = dest pixels 0-7 of row i, d[1] = pixels 8-15

+      for (i = 0; i < 16; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], rounding);

+        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

+        inptr[i] = _mm_srai_epi16(inptr[i], 6);

+        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

+        // Store

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

+        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

+      }

+    }

+  } else {

+    // Run the un-optimised column transform

+    tran_low_t temp_in[16], temp_out[16];

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];  // gather column i of out[]

+      vpx_highbd_idct16_c(temp_in, temp_out, bd);

+      for (j = 0; j < 16; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c

@@ -1,0 +1,41 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                     int stride, int bd) {  // adds one value derived from input[0] to a 32x32 block of dest

+  __m128i dc_value, d;

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i one = _mm_set1_epi16(1);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every lane

+  int a, i, j;

+  tran_low_t out;

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);  // only input[0] is read

+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

+  a = ROUND_POWER_OF_TWO(out, 6);

+  d = _mm_set1_epi32(a);

+  dc_value = _mm_packs_epi32(d, d);  // eight 16-bit copies of a (signed saturation)

+  for (i = 0; i < 32; ++i) {  // one iteration per row

+    for (j = 0; j < 4; ++j) {  // 4 vectors x 8 pixels = 32 pixels per row

+      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));

+      d = _mm_adds_epi16(d, dc_value);

+      d = _mm_max_epi16(d, zero);  // clamp to [0, (1 << bd) - 1]

+      d = _mm_min_epi16(d, max);

+      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);

+    }

+    dest += stride;

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c

@@ -1,0 +1,129 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 4x4 inverse transform added into dest: 16-bit SSE2 passes with a C fallback

+  tran_low_t out[4 * 4];  // row-pass results, read only by the C column pass

+  tran_low_t *outptr = out;

+  int i, j;

+  __m128i inptr[4];

+  __m128i sign_bits[2];

+  __m128i temp_mm, min_input, max_input;

+  int test;

+  int optimised_cols = 0;  // set to 1 when the column pass may also use idct4_sse2

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i eight = _mm_set1_epi16(8);  // 1 << 3, rounds the final >> 4

+  const __m128i max = _mm_set1_epi16(12043);  // NOTE(review): presumably keeps idct4_sse2 inside 16 bits - confirm

+  const __m128i min = _mm_set1_epi16(-12043);

+  // Load input into __m128i (one 4-coefficient row per register)

+  inptr[0] = _mm_loadu_si128((const __m128i *)input);

+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));

+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));

+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

+  // Pack to 16 bits: inptr[0] = rows 0-1, inptr[1] = rows 2-3 (signed saturation)

+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);

+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp_mm = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp_mm);  // non-zero if any lane is outside [-12043, 12043]

+  if (!test) {

+    // Do the row transform

+    idct4_sse2(inptr);

+    // Check the min & max values

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp_mm = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp_mm);

+    if (test) {  // row output out of range: widen it into out[] for the C column pass

+      transpose_4x4(inptr);

+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);  // 0xFFFF in negative lanes

+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);

+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);  // sign-extend; inptr[1] is consumed before it is overwritten

+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);

+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);

+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);

+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);

+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);

+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);

+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised row transform on the original 32-bit input

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct4_c(input, outptr, bd);

+      input += 4;

+      outptr += 4;

+    }

+  }

+  if (optimised_cols) {

+    idct4_sse2(inptr);

+    // Final round and shift

+    inptr[0] = _mm_add_epi16(inptr[0], eight);

+    inptr[1] = _mm_add_epi16(inptr[1], eight);

+    inptr[0] = _mm_srai_epi16(inptr[0], 4);

+    inptr[1] = _mm_srai_epi16(inptr[1], 4);

+    // Reconstruction and Store

+    {

+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);  // dest row 0 in the low 64 bits

+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));  // dest row 2 in the low 64 bits

+      d0 = _mm_unpacklo_epi64(

+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));  // d0 = rows 0 | 1

+      d2 = _mm_unpacklo_epi64(

+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));  // d2 = rows 2 | 3

+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);

+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);

+      // store dest row 0

+      _mm_storel_epi64((__m128i *)dest, d0);

+      // store dest row 1 (high half of d0)

+      d0 = _mm_srli_si128(d0, 8);

+      _mm_storel_epi64((__m128i *)(dest + stride), d0);

+      // store dest row 2

+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);

+      // store dest row 3 (high half of d2)

+      d2 = _mm_srli_si128(d2, 8);

+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);

+    }

+  } else {

+    // Run the un-optimised column transform

+    tran_low_t temp_in[4], temp_out[4];

+    // Columns

+    for (i = 0; i < 4; ++i) {

+      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];  // gather column i of out[]

+      vpx_highbd_idct4_c(temp_in, temp_out, bd);

+      for (j = 0; j < 4; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c

@@ -1,0 +1,216 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 8x8 inverse transform added into dest: 16-bit SSE2 passes with a C fallback

+  tran_low_t out[8 * 8];  // row-pass results, read only by the C column pass

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[8];  // inptr[i] = row i packed to eight 16-bit lanes

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i sixteen = _mm_set1_epi16(16);  // 1 << 4, rounds the final >> 5

+  const __m128i max = _mm_set1_epi16(6201);  // NOTE(review): presumably keeps idct8_sse2 inside 16 bits - confirm

+  const __m128i min = _mm_set1_epi16(-6201);

+  int optimised_cols = 0;  // set to 1 when the column pass may also use idct8_sse2

+  // Load input into __m128i & pack to 16 bits (signed saturation)

+  for (i = 0; i < 8; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the min & max for the row transform

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 8; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is outside [-6201, 6201]

+  if (!test) {

+    // Do the row transform

+    idct8_sse2(inptr);

+    // Find the min & max for the column transform

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 8; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {  // row output out of range: widen it into out[] for the C column pass

+      array_transpose_8x8(inptr, inptr);

+      for (i = 0; i < 8; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 sign-extended to 32 bits

+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 sign-extended to 32 bits

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);  // out[8 * i + 4..7]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);  // out[8 * i + 0..3]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised row transform on the original 32-bit input

+    for (i = 0; i < 8; ++i) {

+      vpx_highbd_idct8_c(input, outptr, bd);

+      input += 8;

+      outptr += 8;

+    }

+  }

+  if (optimised_cols) {

+    idct8_sse2(inptr);

+    // Final round & shift and Reconstruction and Store

+    {

+      __m128i d[8];

+      for (i = 0; i < 8; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        inptr[i] = _mm_srai_epi16(inptr[i], 5);

+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

+        // Store

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

+      }

+    }

+  } else {

+    // Run the un-optimised column transform

+    tran_low_t temp_in[8], temp_out[8];

+    for (i = 0; i < 8; ++i) {

+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // gather column i of out[]

+      vpx_highbd_idct8_c(temp_in, temp_out, bd);

+      for (j = 0; j < 8; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

+      }

+    }

+  }

+}

+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,

+                                    int stride, int bd) {  // 8x8 variant that only range-checks and row-transforms the first 4 rows

+  tran_low_t out[8 * 8] = { 0 };  // zeroed: only rows 0-3 are written before the C column pass

+  tran_low_t *outptr = out;

+  int i, j, test;

+  __m128i inptr[8];  // inptr[i] = row i packed to eight 16-bit lanes

+  __m128i min_input, max_input, temp1, temp2, sign_bits;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i sixteen = _mm_set1_epi16(16);  // 1 << 4, rounds the final >> 5

+  const __m128i max = _mm_set1_epi16(6201);  // NOTE(review): presumably keeps idct8_sse2 inside 16 bits - confirm

+  const __m128i min = _mm_set1_epi16(-6201);

+  int optimised_cols = 0;  // set to 1 when the column pass may also use idct8_sse2

+  // Load input into __m128i & pack to 16 bits (signed saturation)

+  for (i = 0; i < 8; i++) {

+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

+    inptr[i] = _mm_packs_epi32(temp1, temp2);

+  }

+  // Find the min & max for the row transform

+  // only the first 4 rows have non-zero coeffs

+  max_input = _mm_max_epi16(inptr[0], inptr[1]);

+  min_input = _mm_min_epi16(inptr[0], inptr[1]);

+  for (i = 2; i < 4; i++) {

+    max_input = _mm_max_epi16(max_input, inptr[i]);

+    min_input = _mm_min_epi16(min_input, inptr[i]);

+  }

+  max_input = _mm_cmpgt_epi16(max_input, max);

+  min_input = _mm_cmplt_epi16(min_input, min);

+  temp1 = _mm_or_si128(max_input, min_input);

+  test = _mm_movemask_epi8(temp1);  // non-zero if any checked lane is outside [-6201, 6201]

+  if (!test) {

+    // Do the row transform

+    idct8_sse2(inptr);

+    // Find the min & max for the column transform

+    // N.B. Only the first 4 cols contain non-zero coeffs

+    max_input = _mm_max_epi16(inptr[0], inptr[1]);

+    min_input = _mm_min_epi16(inptr[0], inptr[1]);

+    for (i = 2; i < 8; i++) {

+      max_input = _mm_max_epi16(max_input, inptr[i]);

+      min_input = _mm_min_epi16(min_input, inptr[i]);

+    }

+    max_input = _mm_cmpgt_epi16(max_input, max);

+    min_input = _mm_cmplt_epi16(min_input, min);

+    temp1 = _mm_or_si128(max_input, min_input);

+    test = _mm_movemask_epi8(temp1);

+    if (test) {

+      // Use the fact that only the first 4 rows contain non-zero coeffs

+      array_transpose_4X8(inptr, inptr);

+      for (i = 0; i < 4; i++) {

+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // 0xFFFF in negative lanes

+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 sign-extended to 32 bits

+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 sign-extended to 32 bits

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);  // out[8 * i + 4..7]

+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);  // out[8 * i + 0..3]

+      }

+    } else {

+      // Set to use the optimised transform for the column

+      optimised_cols = 1;

+    }

+  } else {

+    // Run the un-optimised row transform (first 4 rows only; the rest of out[] stays 0)

+    for (i = 0; i < 4; ++i) {

+      vpx_highbd_idct8_c(input, outptr, bd);

+      input += 8;

+      outptr += 8;

+    }

+  }

+  if (optimised_cols) {

+    idct8_sse2(inptr);

+    // Final round & shift and Reconstruction and Store

+    {

+      __m128i d[8];

+      for (i = 0; i < 8; i++) {

+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

+        inptr[i] = _mm_srai_epi16(inptr[i], 5);

+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

+        // Store

+        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

+      }

+    }

+  } else {

+    // Run the un-optimised column transform

+    tran_low_t temp_in[8], temp_out[8];

+    for (i = 0; i < 8; ++i) {

+      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // gather column i of out[]

+      vpx_highbd_idct8_c(temp_in, temp_out, bd);

+      for (j = 0; j < 8; ++j) {

+        dest[j * stride + i] = highbd_clip_pixel_add(

+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

+      }

+    }

+  }

+}

--- /dev/null

+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

@@ -1,0 +1,33 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

+#include <emmintrin.h>  // SSE2

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/inv_txfm.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {  // clamps each signed 16-bit lane of value to [0, (1 << bd) - 1]

+  __m128i ubounded, retval;

+  const __m128i zero = _mm_set1_epi16(0);

+  const __m128i one = _mm_set1_epi16(1);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every lane

+  ubounded = _mm_cmpgt_epi16(value, max);  // all-ones mask in lanes above max

+  retval = _mm_andnot_si128(ubounded, value);  // keep value where it is not above max

+  ubounded = _mm_and_si128(ubounded, max);  // max in the lanes that were above it

+  retval = _mm_or_si128(retval, ubounded);  // merge: min(value, max) per lane

+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));  // zero every lane that is not > 0

+  return retval;

+}

+#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

--- a/vpx_dsp/x86/inv_txfm_sse2.c

+++ b/vpx_dsp/x86/inv_txfm_sse2.c

@@ -10,6 +10,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/transpose_sse2.h"

 #include "vpx_dsp/x86/txfm_common_sse2.h"

 #define RECON_AND_STORE4X4(dest, in_x)                    \

@@ -170,14 +171,6 @@

   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);

-static INLINE void transpose_4x4(__m128i *res) {

-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);

-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);

-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);

-}

 void idct4_sse2(__m128i *in) {

   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);

   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

@@ -3349,589 +3342,3 @@

     RECON_AND_STORE(dest + 24 + j * stride, dc_value);

-#if CONFIG_VP9_HIGHBITDEPTH

-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {

-  __m128i ubounded, retval;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

-  ubounded = _mm_cmpgt_epi16(value, max);

-  retval = _mm_andnot_si128(ubounded, value);

-  ubounded = _mm_and_si128(ubounded, max);

-  retval = _mm_or_si128(retval, ubounded);

-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));

-  return retval;

-}

-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                    int stride, int bd) {

-  tran_low_t out[4 * 4];

-  tran_low_t *outptr = out;

-  int i, j;

-  __m128i inptr[4];

-  __m128i sign_bits[2];

-  __m128i temp_mm, min_input, max_input;

-  int test;

-  int optimised_cols = 0;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i eight = _mm_set1_epi16(8);

-  const __m128i max = _mm_set1_epi16(12043);

-  const __m128i min = _mm_set1_epi16(-12043);

-  // Load input into __m128i

-  inptr[0] = _mm_loadu_si128((const __m128i *)input);

-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));

-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));

-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

-  // Pack to 16 bits

-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);

-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp_mm = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp_mm);

-  if (!test) {

-    // Do the row transform

-    idct4_sse2(inptr);

-    // Check the min & max values

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp_mm = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp_mm);

-    if (test) {

-      transpose_4x4(inptr);

-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);

-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);

-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);

-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);

-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);

-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);

-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);

-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);

-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);

-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct4_c(input, outptr, bd);

-      input += 4;

-      outptr += 4;

-    }

-  }

-  if (optimised_cols) {

-    idct4_sse2(inptr);

-    // Final round and shift

-    inptr[0] = _mm_add_epi16(inptr[0], eight);

-    inptr[1] = _mm_add_epi16(inptr[1], eight);

-    inptr[0] = _mm_srai_epi16(inptr[0], 4);

-    inptr[1] = _mm_srai_epi16(inptr[1], 4);

-    // Reconstruction and Store

-    {

-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);

-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));

-      d0 = _mm_unpacklo_epi64(

-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));

-      d2 = _mm_unpacklo_epi64(

-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));

-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);

-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);

-      // store input0

-      _mm_storel_epi64((__m128i *)dest, d0);

-      // store input1

-      d0 = _mm_srli_si128(d0, 8);

-      _mm_storel_epi64((__m128i *)(dest + stride), d0);

-      // store input2

-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);

-      // store input3

-      d2 = _mm_srli_si128(d2, 8);

-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[4], temp_out[4];

-    // Columns

-    for (i = 0; i < 4; ++i) {

-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];

-      vpx_highbd_idct4_c(temp_in, temp_out, bd);

-      for (j = 0; j < 4; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                    int stride, int bd) {

-  tran_low_t out[8 * 8];

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[8];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i sixteen = _mm_set1_epi16(16);

-  const __m128i max = _mm_set1_epi16(6201);

-  const __m128i min = _mm_set1_epi16(-6201);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 8; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 8; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct8_sse2(inptr);

-    // Find the min & max for the column transform

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 8; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      array_transpose_8x8(inptr, inptr);

-      for (i = 0; i < 8; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 8; ++i) {

-      vpx_highbd_idct8_c(input, outptr, bd);

-      input += 8;

-      outptr += 8;

-    }

-  }

-  if (optimised_cols) {

-    idct8_sse2(inptr);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[8];

-      for (i = 0; i < 8; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        inptr[i] = _mm_srai_epi16(inptr[i], 5);

-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[8], temp_out[8];

-    for (i = 0; i < 8; ++i) {

-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-      vpx_highbd_idct8_c(temp_in, temp_out, bd);

-      for (j = 0; j < 8; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                    int stride, int bd) {

-  tran_low_t out[8 * 8] = { 0 };

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[8];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i sixteen = _mm_set1_epi16(16);

-  const __m128i max = _mm_set1_epi16(6201);

-  const __m128i min = _mm_set1_epi16(-6201);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 8; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  // only first 4 row has non-zero coefs

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 4; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct8_sse2(inptr);

-    // Find the min & max for the column transform

-    // N.B. Only first 4 cols contain non-zero coeffs

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 8; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      // Use fact only first 4 rows contain non-zero coeffs

-      array_transpose_4X8(inptr, inptr);

-      for (i = 0; i < 4; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct8_c(input, outptr, bd);

-      input += 8;

-      outptr += 8;

-    }

-  }

-  if (optimised_cols) {

-    idct8_sse2(inptr);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[8];

-      for (i = 0; i < 8; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);

-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        inptr[i] = _mm_srai_epi16(inptr[i], 5);

-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[8], temp_out[8];

-    for (i = 0; i < 8; ++i) {

-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];

-      vpx_highbd_idct8_c(temp_in, temp_out, bd);

-      for (j = 0; j < 8; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                       int stride, int bd) {

-  tran_low_t out[16 * 16];

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[32];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_set1_epi16(32);

-  const __m128i max = _mm_set1_epi16(3155);

-  const __m128i min = _mm_set1_epi16(-3155);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 16; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 32; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform

-    idct16_sse2(inptr, inptr + 16);

-    // Find the min & max for the column transform

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 32; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      array_transpose_16x16(inptr, inptr + 16);

-      for (i = 0; i < 16; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 16; ++i) {

-      vpx_highbd_idct16_c(input, outptr, bd);

-      input += 16;

-      outptr += 16;

-    }

-  }

-  if (optimised_cols) {

-    idct16_sse2(inptr, inptr + 16);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[2];

-      for (i = 0; i < 16; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], rounding);

-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

-        inptr[i] = _mm_srai_epi16(inptr[i], 6);

-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[16], temp_out[16];

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];

-      vpx_highbd_idct16_c(temp_in, temp_out, bd);

-      for (j = 0; j < 16; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                      int stride, int bd) {

-  tran_low_t out[16 * 16] = { 0 };

-  tran_low_t *outptr = out;

-  int i, j, test;

-  __m128i inptr[32];

-  __m128i min_input, max_input, temp1, temp2, sign_bits;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_set1_epi16(32);

-  const __m128i max = _mm_set1_epi16(3155);

-  const __m128i min = _mm_set1_epi16(-3155);

-  int optimised_cols = 0;

-  // Load input into __m128i & pack to 16 bits

-  for (i = 0; i < 16; i++) {

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));

-    inptr[i] = _mm_packs_epi32(temp1, temp2);

-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));

-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));

-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);

-  }

-  // Find the min & max for the row transform

-  // Since all non-zero dct coefficients are in upper-left 4x4 area,

-  // we only need to consider first 4 rows here.

-  max_input = _mm_max_epi16(inptr[0], inptr[1]);

-  min_input = _mm_min_epi16(inptr[0], inptr[1]);

-  for (i = 2; i < 4; i++) {

-    max_input = _mm_max_epi16(max_input, inptr[i]);

-    min_input = _mm_min_epi16(min_input, inptr[i]);

-  }

-  max_input = _mm_cmpgt_epi16(max_input, max);

-  min_input = _mm_cmplt_epi16(min_input, min);

-  temp1 = _mm_or_si128(max_input, min_input);

-  test = _mm_movemask_epi8(temp1);

-  if (!test) {

-    // Do the row transform (N.B. This transposes inptr)

-    idct16_sse2(inptr, inptr + 16);

-    // Find the min & max for the column transform

-    // N.B. Only first 4 cols contain non-zero coeffs

-    max_input = _mm_max_epi16(inptr[0], inptr[1]);

-    min_input = _mm_min_epi16(inptr[0], inptr[1]);

-    for (i = 2; i < 16; i++) {

-      max_input = _mm_max_epi16(max_input, inptr[i]);

-      min_input = _mm_min_epi16(min_input, inptr[i]);

-    }

-    max_input = _mm_cmpgt_epi16(max_input, max);

-    min_input = _mm_cmplt_epi16(min_input, min);

-    temp1 = _mm_or_si128(max_input, min_input);

-    test = _mm_movemask_epi8(temp1);

-    if (test) {

-      // Use fact only first 4 rows contain non-zero coeffs

-      array_transpose_8x8(inptr, inptr);

-      array_transpose_8x8(inptr + 8, inptr + 16);

-      for (i = 0; i < 4; i++) {

-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);

-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);

-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);

-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);

-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);

-      }

-    } else {

-      // Set to use the optimised transform for the column

-      optimised_cols = 1;

-    }

-  } else {

-    // Run the un-optimised row transform

-    for (i = 0; i < 4; ++i) {

-      vpx_highbd_idct16_c(input, outptr, bd);

-      input += 16;

-      outptr += 16;

-    }

-  }

-  if (optimised_cols) {

-    idct16_sse2(inptr, inptr + 16);

-    // Final round & shift and Reconstruction and Store

-    {

-      __m128i d[2];

-      for (i = 0; i < 16; i++) {

-        inptr[i] = _mm_add_epi16(inptr[i], rounding);

-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);

-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));

-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));

-        inptr[i] = _mm_srai_epi16(inptr[i], 6);

-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);

-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);

-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);

-        // Store

-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);

-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);

-      }

-    }

-  } else {

-    // Run the un-optimised column transform

-    tran_low_t temp_in[16], temp_out[16];

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];

-      vpx_highbd_idct16_c(temp_in, temp_out, bd);

-      for (j = 0; j < 16; ++j) {

-        dest[j * stride + i] = highbd_clip_pixel_add(

-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

-      }

-    }

-  }

-}

-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,

-                                     int stride, int bd) {

-  __m128i dc_value, d;

-  const __m128i zero = _mm_setzero_si128();

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

-  int a, i, j;

-  tran_low_t out;

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);

-  a = ROUND_POWER_OF_TWO(out, 6);

-  d = _mm_set1_epi32(a);

-  dc_value = _mm_packs_epi32(d, d);

-  for (i = 0; i < 32; ++i) {

-    for (j = 0; j < 4; ++j) {

-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));

-      d = _mm_adds_epi16(d, dc_value);

-      d = _mm_max_epi16(d, zero);

-      d = _mm_min_epi16(d, max);

-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);

-    }

-    dest += stride;

-  }

-}

-#endif  // CONFIG_VP9_HIGHBITDEPTH

--- /dev/null

+++ b/vpx_dsp/x86/transpose_sse2.h

@@ -1,0 +1,26 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_

+#define VPX_DSP_X86_TRANSPOSE_SSE2_H_

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/x86/inv_txfm_sse2.h"

+#include "vpx_dsp/x86/txfm_common_sse2.h"

+// Transposes a 4x4 block of 16-bit values in place. On entry res[0] carries
+// rows 0 and 1 and res[1] carries rows 2 and 3 (four lanes per row); on return
+// res[0] carries columns 0 and 1 and res[1] columns 2 and 3, laid out the
+// same way.
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i rows01 = res[0];
+  const __m128i rows23 = res[1];
+  // First pass: interleave row 0 with row 2, and row 1 with row 3.
+  const __m128i even = _mm_unpacklo_epi16(rows01, rows23);
+  const __m128i odd = _mm_unpackhi_epi16(rows01, rows23);
+  // Second pass: interleaving the two halves groups the lanes by column.
+  res[0] = _mm_unpacklo_epi16(even, odd);
+  res[1] = _mm_unpackhi_epi16(even, odd);
+}

+#endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_