shithub: libvpx

Download patch

ref: f62dcc9c334eb5060293cbf0cfd7de82bb4ea78c
parent: b5bc9ee02dce7a129fa363d4f887dc41bb37c9fd
author: Yi Luo <[email protected]>
date: Thu Feb 16 08:15:22 EST 2017

Replace idct32x32_1024_add_ssse3 assembly with intrinsics

- Encoding/decoding tests with BQTerrace_1920x1080_60.y4m on an
  i7-6700 show no obvious user-level speed regression.
- Passed unit tests.

Change-Id: I20688e0dd3731021ec8fb4404734336f1a426bfc

--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -285,7 +285,7 @@
   }
 }
 
-TEST_P(PartialIDctTest, DISABLED_Speed) {
+TEST_P(PartialIDctTest, Speed) {
   // Keep runtime stable with transform size.
   const int kCountSpeedTestBlock = 500000000 / input_block_size_;
   InitMem();
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -203,9 +203,6 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
-endif  # ARCH_X86_64
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
 
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -803,7 +803,7 @@
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+    specialize qw/vpx_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
     specialize qw/vpx_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -1222,3 +1222,465 @@
   idct32_135(col2, col3);
   recon_and_store(col2, col3, dest + 16, stride);
 }
+
+// For each 8x32 block __m128i in[32], output __m128i in[32]
+static void idct32_8x32(const __m128i *in, __m128i *out) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  /* Stage1 */
+  {
+    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);
+    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);
+    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);
+    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);
+
+    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);
+    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);
+    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);
+    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);
+
+    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);
+    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);
+    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);
+    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);
+
+    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);
+    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);
+    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);
+    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);
+
+    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, stg1_1,
+                           stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, stp1_30)
+    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5,
+                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28)
+    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, stg1_9,
+                           stg1_10, stg1_11, stp1_20, stp1_27, stp1_21, stp1_26)
+    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
+                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, stp1_23,
+                           stp1_24)
+  }
+
+  /* Stage2 */
+  {
+    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);
+    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);
+    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);
+    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);
+
+    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);
+    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);
+    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);
+    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);
+
+    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, stg2_1,
+                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)
+    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, stg2_5,
+                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12)
+
+    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+  }
+
+  /* Stage3 */
+  {
+    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);
+    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);
+    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);
+    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);
+
+    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+
+    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, stg3_1,
+                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)
+
+    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+
+    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,
+                           stp1_29)
+    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22,
+                           stp1_25)
+
+    stp1_16 = stp2_16;
+    stp1_31 = stp2_31;
+    stp1_19 = stp2_19;
+    stp1_20 = stp2_20;
+    stp1_23 = stp2_23;
+    stp1_24 = stp2_24;
+    stp1_27 = stp2_27;
+    stp1_28 = stp2_28;
+  }
+
+  /* Stage4 */
+  {
+    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);
+    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);
+    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);
+    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);
+
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1,
+                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)
+
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5,
+                           stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13)
+
+    stp2_8 = stp1_8;
+    stp2_15 = stp1_15;
+    stp2_11 = stp1_11;
+    stp2_12 = stp1_12;
+
+    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
+    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
+    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
+    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
+    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
+    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
+    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
+    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
+
+    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
+    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
+    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
+    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
+    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
+    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
+    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
+    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
+  }
+
+  /* Stage5 */
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1);
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+    stp1_4 = stp2_4;
+    stp1_7 = stp2_7;
+
+    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
+    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
+
+    stp1_16 = stp2_16;
+    stp1_17 = stp2_17;
+
+    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,
+                           stp1_28)
+    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,
+                           stp1_26)
+
+    stp1_22 = stp2_22;
+    stp1_23 = stp2_23;
+    stp1_24 = stp2_24;
+    stp1_25 = stp2_25;
+    stp1_30 = stp2_30;
+    stp1_31 = stp2_31;
+  }
+
+  /* Stage6 */
+  {
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
+
+    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
+    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
+
+    stp2_8 = stp1_8;
+    stp2_9 = stp1_9;
+    stp2_14 = stp1_14;
+    stp2_15 = stp1_15;
+
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,
+                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,
+                           stp2_12)
+
+    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
+    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
+    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
+    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
+    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
+    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
+    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
+    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
+
+    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
+    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
+    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
+    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
+    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
+    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
+    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
+    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
+  }
+
+  /* Stage7 */
+  {
+    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+    stp1_16 = stp2_16;
+    stp1_17 = stp2_17;
+    stp1_18 = stp2_18;
+    stp1_19 = stp2_19;
+
+    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,
+                           stp1_26)
+    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,
+                           stp1_24)
+
+    stp1_28 = stp2_28;
+    stp1_29 = stp2_29;
+    stp1_30 = stp2_30;
+    stp1_31 = stp2_31;
+  }
+
+  out[0] = _mm_add_epi16(stp1_0, stp1_31);
+  out[1] = _mm_add_epi16(stp1_1, stp1_30);
+  out[2] = _mm_add_epi16(stp1_2, stp1_29);
+  out[3] = _mm_add_epi16(stp1_3, stp1_28);
+  out[4] = _mm_add_epi16(stp1_4, stp1_27);
+  out[5] = _mm_add_epi16(stp1_5, stp1_26);
+  out[6] = _mm_add_epi16(stp1_6, stp1_25);
+  out[7] = _mm_add_epi16(stp1_7, stp1_24);
+  out[8] = _mm_add_epi16(stp1_8, stp1_23);
+  out[9] = _mm_add_epi16(stp1_9, stp1_22);
+  out[10] = _mm_add_epi16(stp1_10, stp1_21);
+  out[11] = _mm_add_epi16(stp1_11, stp1_20);
+  out[12] = _mm_add_epi16(stp1_12, stp1_19);
+  out[13] = _mm_add_epi16(stp1_13, stp1_18);
+  out[14] = _mm_add_epi16(stp1_14, stp1_17);
+  out[15] = _mm_add_epi16(stp1_15, stp1_16);
+  out[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  out[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  out[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  out[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  out[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  out[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  out[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  out[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  out[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  out[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  out[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  out[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  out[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  out[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  out[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  out[31] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    in[i] = load_input_data(input);
+    in[i + 8] = load_input_data(input + 8);
+    in[i + 16] = load_input_data(input + 16);
+    in[i + 24] = load_input_data(input + 24);
+    input += 32;
+  }
+}
+
+void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                                  int stride) {
+  __m128i col[128], in[32];
+  int i, j;
+
+  // rows
+  for (i = 0; i < 4; ++i) {
+    load_buffer_8x32(input, in);
+    input += 32 << 3;
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    idct32_8x32(in, col + (i << 5));
+  }
+
+  // columns
+  for (i = 0; i < 4; ++i) {
+    j = i << 3;
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    idct32_8x32(in, in);
+    store_buffer_8x32(in, dest, stride);
+    dest += 8;
+  }
+}
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,1412 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-
-pw_m2404x2:  times 8 dw  -2404*2
-pw_m4756x2:  times 8 dw  -4756*2
-pw_m5520x2:  times 8 dw  -5520*2
-pw_m8423x2:  times 8 dw  -8423*2
-pw_m9102x2:  times 8 dw  -9102*2
-pw_m10394x2: times 8 dw -10394*2
-pw_m11003x2: times 8 dw -11003*2
-
-pw_16364x2: times 8 dw 16364*2
-pw_16305x2: times 8 dw 16305*2
-pw_16207x2: times 8 dw 16207*2
-pw_16069x2: times 8 dw 16069*2
-pw_15893x2: times 8 dw 15893*2
-pw_15679x2: times 8 dw 15679*2
-pw_15426x2: times 8 dw 15426*2
-pw_15137x2: times 8 dw 15137*2
-pw_14811x2: times 8 dw 14811*2
-pw_14449x2: times 8 dw 14449*2
-pw_14053x2: times 8 dw 14053*2
-pw_13623x2: times 8 dw 13623*2
-pw_13160x2: times 8 dw 13160*2
-pw_12665x2: times 8 dw 12665*2
-pw_12140x2: times 8 dw 12140*2
-pw__9760x2: times 8 dw  9760*2
-pw__7723x2: times 8 dw  7723*2
-pw__7005x2: times 8 dw  7005*2
-pw__6270x2: times 8 dw  6270*2
-pw__3981x2: times 8 dw  3981*2
-pw__3196x2: times 8 dw  3196*2
-pw__1606x2: times 8 dw  1606*2
-pw___804x2: times 8 dw   804*2
-
-pd_8192:    times 4 dd 8192
-pw_32:      times 8 dw 32
-pw_16:      times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
-pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
-%endmacro
-
-TRANSFORM_COEFFS    6270, 15137
-TRANSFORM_COEFFS    3196, 16069
-TRANSFORM_COEFFS   13623,  9102
-
-; constants for 32x32_34
-TRANSFORM_COEFFS      804, 16364
-TRANSFORM_COEFFS    15426,  5520
-TRANSFORM_COEFFS     3981, 15893
-TRANSFORM_COEFFS    16207,  2404
-TRANSFORM_COEFFS     1606, 16305
-TRANSFORM_COEFFS    15679,  4756
-TRANSFORM_COEFFS    11585, 11585
-
-; constants for 32x32_1024
-TRANSFORM_COEFFS    12140, 11003
-TRANSFORM_COEFFS     7005, 14811
-TRANSFORM_COEFFS    14053,  8423
-TRANSFORM_COEFFS     9760, 13160
-TRANSFORM_COEFFS    12665, 10394
-TRANSFORM_COEFFS     7723, 14449
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS     30274, 12540
-PAIR_PP_COEFFS      6392, 32138
-PAIR_MP_COEFFS     18204, 27246
-
-PAIR_PP_COEFFS     12540, 12540
-PAIR_PP_COEFFS     30274, 30274
-PAIR_PP_COEFFS      6392,  6392
-PAIR_PP_COEFFS     32138, 32138
-PAIR_MM_COEFFS     18204, 18204
-PAIR_PP_COEFFS     27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; 8x8 transpose. This follows same operations as SSE2 does.
-%macro TRANSPOSE8X8 10
-  ; stage 1
-  punpcklwd  m%9, m%1, m%2
-  punpcklwd  m%10, m%3, m%4
-  punpckhwd  m%1, m%2
-  punpckhwd  m%3, m%4
-
-  punpcklwd  m%2, m%5, m%6
-  punpcklwd  m%4, m%7, m%8
-  punpckhwd  m%5, m%6
-  punpckhwd  m%7, m%8
-
-  ; stage 2
-  punpckldq  m%6, m%9, m%10
-  punpckldq  m%8, m%1, m%3
-  punpckhdq  m%9, m%10
-  punpckhdq  m%1, m%3
-
-  punpckldq  m%10, m%2, m%4
-  punpckldq  m%3, m%5, m%7
-  punpckhdq  m%2, m%4
-  punpckhdq  m%5, m%7
-
-  ; stage 3
-  punpckhqdq  m%4, m%9, m%2  ; out3
-  punpcklqdq  m%9, m%2       ; out2
-  punpcklqdq  m%7, m%1, m%5  ; out6
-  punpckhqdq  m%1, m%5       ; out7
-
-  punpckhqdq  m%2, m%6, m%10 ; out1
-  punpcklqdq  m%6, m%10      ; out0
-  punpcklqdq  m%5, m%8, m%3  ; out4
-  punpckhqdq  m%8, m%3       ; out5
-
-  SWAP %6, %1
-  SWAP %3, %9
-  SWAP %8, %6
-%endmacro
-
-%macro IDCT8_1D 0
-  SUM_SUB          0,    4,    9
-  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
-  pmulhrsw        m0,  m12
-  pmulhrsw        m4,  m12
-  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
-  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
-
-  SUM_SUB          1,    5,    9
-  SUM_SUB          7,    3,    9
-  SUM_SUB          0,    6,    9
-  SUM_SUB          4,    2,    9
-  SUM_SUB          3,    5,    9
-  pmulhrsw        m3,  m12
-  pmulhrsw        m5,  m12
-
-  SUM_SUB          0,    7,    9
-  SUM_SUB          4,    3,    9
-  SUM_SUB          2,    5,    9
-  SUM_SUB          6,    1,    9
-
-  SWAP             3,    6
-  SWAP             1,    4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
-  paddw           m%1, m11
-  paddw           m%2, m11
-  psraw           m%1, 5
-  psraw           m%2, 5
-
-  movh            m%3, [outputq]
-  movh            m%4, [outputq + strideq]
-  punpcklbw       m%3, m%5
-  punpcklbw       m%4, m%5
-  paddw           m%3, m%1
-  paddw           m%4, m%2
-  packuswb        m%3, m%5
-  packuswb        m%4, m%5
-  movh               [outputq], m%3
-  movh     [outputq + strideq], m%4
-%endmacro
-
-%define  idx0 16 * 0
-%define  idx1 16 * 1
-%define  idx2 16 * 2
-%define  idx3 16 * 3
-%define  idx4 16 * 4
-%define  idx5 16 * 5
-%define  idx6 16 * 6
-%define  idx7 16 * 7
-%define  idx8 16 * 0
-%define  idx9 16 * 1
-%define idx10 16 * 2
-%define idx11 16 * 3
-%define idx12 16 * 4
-%define idx13 16 * 5
-%define idx14 16 * 6
-%define idx15 16 * 7
-%define idx16 16 * 0
-%define idx17 16 * 1
-%define idx18 16 * 2
-%define idx19 16 * 3
-%define idx20 16 * 4
-%define idx21 16 * 5
-%define idx22 16 * 6
-%define idx23 16 * 7
-%define idx24 16 * 0
-%define idx25 16 * 1
-%define idx26 16 * 2
-%define idx27 16 * 3
-%define idx28 16 * 4
-%define idx29 16 * 5
-%define idx30 16 * 6
-%define idx31 16 * 7
-
-; FROM idct32x32_add_neon.asm
-;
-; Instead of doing the transforms stage by stage, it is done by loading
-; some input values and doing as many stages as possible to minimize the
-; storing/loading of intermediate results. To fit within registers, the
-; final coefficients are cut into four blocks:
-; BLOCK A: 16-19,28-31
-; BLOCK B: 20-23,24-27
-; BLOCK C: 8-11,12-15
-; BLOCK D: 0-3,4-7
-; Blocks A and C are straight calculation through the various stages. In
-; block B, further calculations are performed using the results from
-; block A. In block D, further calculations are performed using the results
-; from block C and then the final calculations are done using results from
-; block A and B which have been combined at the end of block B.
-;
-
-%macro IDCT32X32_34 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, m1
-  pmulhrsw             m1, [pw___804x2] ; stp1_16
-  mova      [r4 +      0], m0
-  pmulhrsw            m11, [pw_16364x2] ; stp2_31
-  mova      [r4 + 16 * 2], m2
-  mova                m12, m7
-  pmulhrsw             m7, [pw_15426x2] ; stp1_28
-  mova      [r4 + 16 * 4], m4
-  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
-  mova      [r4 + 16 * 6], m6
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m2, m1   ; stp1_16
-  mova                 m0, m11  ; stp1_31
-  mova                 m4, m7   ; stp1_28
-  mova                m15, m12  ; stp1_19
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m6, m5
-  pmulhrsw             m5, [pw__3981x2] ; stp1_20
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m15
-  pmulhrsw             m6, [pw_15893x2] ; stp2_27
-  mova [stp + %4 + idx30], m2
-  mova                 m2, m3
-  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
-  mova [stp + %4 + idx31], m11
-  pmulhrsw             m2, [pw_16207x2] ; stp2_24
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m13, m5 ; stp1_20
-  mova                m14, m6 ; stp1_27
-  mova                m15, m3 ; stp1_23
-  mova                m11, m2 ; stp1_24
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22
-  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m10, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m10
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5, 9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB              11, 15,  9
-  pmulhrsw            m11, m10  ; stp1_25
-  pmulhrsw            m15, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_24
-  pmulhrsw             m3, m10  ; stp1_23
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP 6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP 11, 15
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP 2, 3
-%endif
-
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m11
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__1606x2] ; stp1_8
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  pmulhrsw             m1, [pw_16305x2] ; stp2_15
-  mova [stp + %3 + idx22], m15
-  mova                 m7, m6
-  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
-  mova [stp + %3 + idx23], m3
-  pmulhrsw             m6, [pw_15679x2] ; stp1_12
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m3, m0 ; stp1_8
-  mova                 m2, m1 ; stp1_15
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  mova                 m4, m7 ; stp1_11
-  mova                 m5, m6 ; stp1_12
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,  4, 9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,  7, 9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP 5, 4
-  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP 6, 7
-%endif
-
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, m11
-  pmulhrsw            m11, [pw__3196x2] ; stp1_4
-  pmulhrsw            m12, [pw_16069x2] ; stp1_7
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                m10, [pw_11585x2]
-  pmulhrsw             m0, m10  ; stp1_1
-
-  mova                m14, m11 ; stp1_4
-  mova                m13, m12 ; stp1_7
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  mova                 m7, m0 ; stp1_0 = stp1_1
-  mova                 m4, m0 ; stp1_1
-  mova                 m2, m7 ; stp1_0
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               7, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               4, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  1, 9 ;  stp1_0, stp1_15
-  SUM_SUB               7,  3, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               4,  6, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m15, [stp + %4 + idx30]
-  mova                m10, [stp + %4 + idx31]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               7, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m7
-  mova [stp + %4 + idx30], m15
-  mova [stp + %4 + idx31], m10
-  mova                 m7, [stp + %4 + idx28]
-  mova                 m0, [stp + %4 + idx29]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               4,  7, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m4
-  mova [stp + %4 + idx28], m7
-  mova [stp + %4 + idx29], m0
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m4, [stp + %3 + idx19]
-  SUM_SUB               1,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               3,  7, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               6,  4, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m4
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m0, [stp + %4 + idx27]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m3, [stp + %4 + idx24]
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  mova [stp + %4 + idx27], m0
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx24], m3
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro RECON_AND_STORE 1
-  mova            m11, [pw_32]
-  lea             stp, [rsp + %1]
-  mov              r6, 32
-  pxor             m8, m8
-%%recon_and_store:
-  mova             m0, [stp + 16 * 32 * 0]
-  mova             m1, [stp + 16 * 32 * 1]
-  mova             m2, [stp + 16 * 32 * 2]
-  mova             m3, [stp + 16 * 32 * 3]
-  add             stp, 16
-
-  paddw            m0, m11
-  paddw            m1, m11
-  paddw            m2, m11
-  paddw            m3, m11
-  psraw            m0, 6
-  psraw            m1, 6
-  psraw            m2, 6
-  psraw            m3, 6
-  movh             m4, [outputq +  0]
-  movh             m5, [outputq +  8]
-  movh             m6, [outputq + 16]
-  movh             m7, [outputq + 24]
-  punpcklbw        m4, m8
-  punpcklbw        m5, m8
-  punpcklbw        m6, m8
-  punpcklbw        m7, m8
-  paddw            m0, m4
-  paddw            m1, m5
-  paddw            m2, m6
-  paddw            m3, m7
-  packuswb         m0, m1
-  packuswb         m2, m3
-  mova [outputq +  0], m0
-  mova [outputq + 16], m2
-  lea         outputq, [outputq + strideq]
-  dec              r6
-  jnz %%recon_and_store
-%endmacro
-
-%define i32x32_size     16*32*5
-%define pass_two_start  16*32*0
-%define transposed_in   16*32*4
-%define pass_one_start  16*32*0
-%define stp r8
-%macro IDCT32X32_135 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m1, [rsp + transposed_in + 16 *  1]
-  mova                m11, m1
-  pmulhrsw             m1, [pw___804x2] ; stp1_16
-  pmulhrsw            m11, [pw_16364x2] ; stp2_31
-
-  mova                 m7, [rsp + transposed_in + 16 *  7]
-  mova                m12, m7
-  pmulhrsw             m7, [pw_15426x2] ; stp1_28
-  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
-
-  mova                 m3, [rsp + transposed_in + 16 *  9]
-  mova                 m4, m3
-  pmulhrsw             m3, [pw__7005x2] ; stp1_18
-  pmulhrsw             m4, [pw_14811x2] ; stp2_29
-
-  mova                 m0, [rsp + transposed_in + 16 * 15]
-  mova                 m2, m0
-  pmulhrsw             m0, [pw_12140x2]  ; stp1_30
-  pmulhrsw             m2, [pw_m11003x2] ; stp2_17
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
-  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
-  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
-  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m3
-  mova [stp + %4 + idx30], m2
-  mova [stp + %4 + idx31], m11
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m2, [rsp + transposed_in + 16 *  3]
-  mova                 m3, m2
-  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
-  pmulhrsw             m2, [pw_16207x2] ; stp2_24
-
-  mova                 m5, [rsp + transposed_in + 16 *  5]
-  mova                 m6, m5
-  pmulhrsw             m5, [pw__3981x2] ; stp1_20
-  pmulhrsw             m6, [pw_15893x2] ; stp2_27
-
-  mova                m14, [rsp + transposed_in + 16 * 11]
-  mova                m13, m14
-  pmulhrsw            m13, [pw_m8423x2] ; stp1_21
-  pmulhrsw            m14, [pw_14053x2] ; stp2_26
-
-  mova                 m0, [rsp + transposed_in + 16 * 13]
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__9760x2] ; stp1_22
-  pmulhrsw             m1, [pw_13160x2] ; stp2_25
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
-  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
-  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
-  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                m11, [stp + %3 + idx18]
-  mova                m12, [stp + %3 + idx19]
-  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
-  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m4
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m11
-  mova [stp + %3 + idx19], m12
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m11, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m11
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5,  9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB               1,  0,  9
-  pmulhrsw             m1, m10  ; stp1_25
-  pmulhrsw             m0, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_25
-  pmulhrsw             m3, m10  ; stp1_22
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP  6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP  1, 0
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP  2, 3
-%endif
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  mova [stp + %3 + idx22], m0
-  mova [stp + %3 + idx23], m3
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m1
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__1606x2] ; stp1_8
-  pmulhrsw             m1, [pw_16305x2] ; stp2_15
-
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-  mova                 m7, m6
-  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
-  pmulhrsw             m6, [pw_15679x2] ; stp1_12
-
-  mova                 m4, [rsp + transposed_in + 16 * 10]
-  mova                 m5, m4
-  pmulhrsw             m4, [pw__7723x2] ; stp1_10
-  pmulhrsw             m5, [pw_14449x2] ; stp2_13
-
-  mova                 m2, [rsp + transposed_in + 16 * 14]
-  mova                 m3, m2
-  pmulhrsw             m3, [pw_m10394x2] ; stp1_9
-  pmulhrsw             m2, [pw_12665x2] ; stp2_14
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
-  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
-  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
-  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP  5, 4
-  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP  6, 7
-%endif
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, m11
-  pmulhrsw            m11, [pw__3196x2] ; stp1_4
-  pmulhrsw            m12, [pw_16069x2] ; stp1_7
-
-  mova                m13, [rsp + transposed_in + 16 * 12]
-  mova                m14, m13
-  pmulhrsw            m13, [pw_13623x2] ; stp1_6
-  pmulhrsw            m14, [pw_m9102x2] ; stp1_5
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                 m2, [rsp + transposed_in + 16 *  8]
-  pmulhrsw             m0, [pw_11585x2]  ; stp1_1
-  mova                 m3, m2
-  pmulhrsw             m2, [pw__6270x2]  ; stp1_2
-  pmulhrsw             m3, [pw_15137x2]  ; stp1_3
-
-  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
-  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  mova                 m1, m0    ; stp1_0 = stp1_1
-  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
-  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %2 + idx12]
-  mova                 m5, [stp + %2 + idx13]
-  mova                 m6, [stp + %2 + idx14]
-  mova                 m7, [stp + %2 + idx15]
-  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
-  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m10, [stp + %4 + idx31]
-  mova                m15, [stp + %4 + idx30]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m1
-  mova [stp + %4 + idx31], m10
-  mova [stp + %4 + idx30], m15
-  mova                 m0, [stp + %4 + idx29]
-  mova                 m1, [stp + %4 + idx28]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m3
-  mova [stp + %4 + idx29], m0
-  mova [stp + %4 + idx28], m1
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m1, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m3, [stp + %3 + idx19]
-  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m4
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m6
-  mova [stp + %2 + idx15], m7
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m1
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m3
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m3, [stp + %4 + idx24]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m0, [stp + %4 + idx27]
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  mova [stp + %4 + idx24], m3
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx27], m0
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro IDCT32X32_1024 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m1, [rsp + transposed_in + 16 *  1]
-  mova                m11, [rsp + transposed_in + 16 * 31]
-  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
-
-  mova                 m0, [rsp + transposed_in + 16 * 15]
-  mova                 m2, [rsp + transposed_in + 16 * 17]
-  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
-
-  mova                 m7, [rsp + transposed_in + 16 *  7]
-  mova                m12, [rsp + transposed_in + 16 * 25]
-  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova                 m3, [rsp + transposed_in + 16 *  9]
-  mova                 m4, [rsp + transposed_in + 16 * 23]
-  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
-  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
-  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
-  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m3
-  mova [stp + %4 + idx30], m2
-  mova [stp + %4 + idx31], m11
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m5, [rsp + transposed_in + 16 *  5]
-  mova                 m6, [rsp + transposed_in + 16 * 27]
-  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
-
-  mova                m13, [rsp + transposed_in + 16 * 21]
-  mova                m14, [rsp + transposed_in + 16 * 11]
-  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
-
-  mova                 m0, [rsp + transposed_in + 16 * 13]
-  mova                 m1, [rsp + transposed_in + 16 * 19]
-  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
-
-  mova                 m2, [rsp + transposed_in + 16 *  3]
-  mova                 m3, [rsp + transposed_in + 16 * 29]
-  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
-  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
-  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
-  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                m11, [stp + %3 + idx18]
-  mova                m12, [stp + %3 + idx19]
-  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
-  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m4
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m11
-  mova [stp + %3 + idx19], m12
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m11, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m11
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5,  9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB               1,  0,  9
-  pmulhrsw             m1, m10  ; stp1_25
-  pmulhrsw             m0, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_25
-  pmulhrsw             m3, m10  ; stp1_22
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP  6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP  1, 0
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP  2, 3
-%endif
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  mova [stp + %3 + idx22], m0
-  mova [stp + %3 + idx23], m3
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m1
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m1, [rsp + transposed_in + 16 * 30]
-  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
-
-  mova                 m2, [rsp + transposed_in + 16 * 14]
-  mova                 m3, [rsp + transposed_in + 16 * 18]
-  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
-
-  mova                 m4, [rsp + transposed_in + 16 * 10]
-  mova                 m5, [rsp + transposed_in + 16 * 22]
-  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
-
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-  mova                 m7, [rsp + transposed_in + 16 * 26]
-  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
-  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
-  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
-  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP  5, 4
-  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP  6, 7
-%endif
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, [rsp + transposed_in + 16 * 28]
-  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
-
-  mova                m13, [rsp + transposed_in + 16 * 12]
-  mova                m14, [rsp + transposed_in + 16 * 20]
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                 m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               0,    1,  9
-  pmulhrsw             m0, m10  ; stp1_1
-  pmulhrsw             m1, m10  ; stp1_0
-%else
-  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
-  SWAP  0, 1
-%endif
-  mova                 m2, [rsp + transposed_in + 16 *  8]
-  mova                 m3, [rsp + transposed_in + 16 * 24]
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
-
-  mova                m10, [pw_11585x2]
-  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
-  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
-  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %2 + idx12]
-  mova                 m5, [stp + %2 + idx13]
-  mova                 m6, [stp + %2 + idx14]
-  mova                 m7, [stp + %2 + idx15]
-  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
-  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m10, [stp + %4 + idx31]
-  mova                m15, [stp + %4 + idx30]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m1
-  mova [stp + %4 + idx31], m10
-  mova [stp + %4 + idx30], m15
-  mova                 m0, [stp + %4 + idx29]
-  mova                 m1, [stp + %4 + idx28]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m3
-  mova [stp + %4 + idx29], m0
-  mova [stp + %4 + idx28], m1
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m1, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m3, [stp + %3 + idx19]
-  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m4
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m6
-  mova [stp + %2 + idx15], m7
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m1
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m3
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m3, [stp + %4 + idx24]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m0, [stp + %4 + idx27]
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  mova [stp + %4 + idx24], m3
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx27], m0
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
-  mova            m8, [pd_8192]
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-
-idct32x32_1024:
-  mov             r3, inputq
-  lea             r4, [rsp + transposed_in]
-  mov             r7, 4
-
-idct32x32_1024_transpose:
-  LOAD_TRAN_LOW 0, r3,   0
-  LOAD_TRAN_LOW 1, r3,  32
-  LOAD_TRAN_LOW 2, r3,  64
-  LOAD_TRAN_LOW 3, r3,  96
-  LOAD_TRAN_LOW 4, r3, 128
-  LOAD_TRAN_LOW 5, r3, 160
-  LOAD_TRAN_LOW 6, r3, 192
-  LOAD_TRAN_LOW 7, r3, 224
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-  INCREMENT_TRAN_LOW r3
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_1024_transpose
-
-  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
-  lea            stp, [stp + 16 * 8]
-  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
-  dec             r6
-  jnz idct32x32_1024
-
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-  lea             r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
-  lea             r4, [rsp + transposed_in]
-  mov             r3, r9
-  mov             r7, 4
-
-idct32x32_1024_transpose_2:
-  mova            m0, [r3 +      0]
-  mova            m1, [r3 + 16 * 1]
-  mova            m2, [r3 + 16 * 2]
-  mova            m3, [r3 + 16 * 3]
-  mova            m4, [r3 + 16 * 4]
-  mova            m5, [r3 + 16 * 5]
-  mova            m6, [r3 + 16 * 6]
-  mova            m7, [r3 + 16 * 7]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-
-  add             r3, 16 * 8
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_1024_transpose_2
-
-  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
-  lea            stp, [stp + 16 * 32]
-  add             r9, 16 * 32
-  dec             r6
-  jnz idct32x32_1024_2
-
-  RECON_AND_STORE pass_two_start
-
-  RET
-%endif