shithub: libvpx

ref: b4c7d0523aad96483c380338fd8095d81826fc19
parent: c1b233dd43dfc257476e8775765c7907c3a851b2
parent: e8b133c79c6cf99102020f194513fb3e2feaa322
author: Jingning Han <[email protected]>
date: Sat Aug 1 12:20:24 EDT 2015

Merge "Factor inverse transform functions into vpx_dsp"

--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -14,8 +14,7 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "./vp9_rtcd.h"
-
+#include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
--- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
+++ /dev/null
@@ -1,61 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vp9/common/vp9_idct.h"
-#include "vpx_ports/mem.h"
-
-void vp9_idct16x16_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x8_t d2u8, d3u8, d30u8, d31u8;
-    uint64x1_t d2u64, d3u64, d4u64, d5u64;
-    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q0s16;
-    uint8_t *d1, *d2;
-    int16_t i, j, a1, cospi_16_64 = 11585;
-    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-    out = dct_const_round_shift(out * cospi_16_64);
-    a1 = ROUND_POWER_OF_TWO(out, 6);
-
-    q0s16 = vdupq_n_s16(a1);
-    q0u16 = vreinterpretq_u16_s16(q0s16);
-
-    for (d1 = d2 = dest, i = 0; i < 4; i++) {
-        for (j = 0; j < 2; j++) {
-            d2u64 = vld1_u64((const uint64_t *)d1);
-            d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
-            d1 += dest_stride;
-            d4u64 = vld1_u64((const uint64_t *)d1);
-            d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
-            d1 += dest_stride;
-
-            q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-            q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-            q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-            q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
-            d2 += dest_stride;
-            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
-            d2 += dest_stride;
-        }
-    }
-    return;
-}
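
[Editorial note, not part of the patch] The file removed above handles only the DC term of the 16x16 inverse transform: input[0] is scaled by cospi_16_64 twice with 14-bit rounding, rounded by a further 6 bits to give a1, and a1 is added with saturation to every destination pixel. A minimal scalar sketch of that path, with the libvpx helpers written out inline (the names here are illustrative, not the library's):

#include <stdint.h>

/* Scalar sketch of the DC-only 16x16 add; in libvpx the rounding and
 * clipping come from dct_const_round_shift(), ROUND_POWER_OF_TWO() and
 * clip_pixel(), spelled out inline here so the sketch is self-contained. */
static int16_t round_shift14(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);    /* DCT_CONST_BITS == 14 */
}
static uint8_t clip_u8(int x) {
  return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
}
static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  const int32_t cospi_16_64 = 11585;
  int16_t out = round_shift14(input[0] * cospi_16_64);
  int a1, r, c;
  out = round_shift14(out * cospi_16_64);
  a1 = (out + 32) >> 6;                       /* ROUND_POWER_OF_TWO(out, 6) */
  for (r = 0; r < 16; ++r, dest += dest_stride)
    for (c = 0; c < 16; ++c)
      dest[c] = clip_u8(dest[c] + a1);        /* dest[x] + a1, saturated */
}

The NEON routine above performs the same arithmetic but adds a1 to eight pixels at a time with each vaddw_u8/vqmovun_s16 pair, two 8-byte halves per row.
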
--- a/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
+++ /dev/null
@@ -1,198 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_idct16x16_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-;                                    int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|vp9_idct16x16_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asr              r0, r0, #6                ; >> 6
-
-    vdup.s16         q0, r0                    ; duplicate a1
-    mov              r0, #8
-    sub              r2, #8
-
-    ; load destination data row0 - row3
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row4 - row7
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row8 - row11
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row12 - row15
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vp9_idct16x16_1_add_neon|
-
-    END
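
[Editorial note, not part of the patch] The immediates in the assembly above decode as follows: 0x2d00 + 0x41 = 0x2d41 = 11585 is cospi_16_64, i.e. round(2^14 * cos(16*pi/64)), built in two instructions because a single ARM data-processing immediate cannot encode 0x2d41; and each "add #0x2000 / asr #14" pair is dct_const_round_shift() with DCT_CONST_BITS = 14, since 0x2000 = 1 << 13. A throwaway C check of those numbers (standalone, assumed for illustration only):

#include <math.h>
#include <stdio.h>

int main(void) {
  /* "mov r12, #0x2d00 ; add r12, #0x41" */
  const int cospi_16_64 = 0x2d00 + 0x41;                       /* 11585 */
  const double pi = acos(-1.0);
  printf("%d vs %ld\n", cospi_16_64,
         lrint(16384.0 * cos(16.0 * pi / 64.0)));              /* both 11585 */
  /* "add r0, r0, #0x2000 ; asr r0, r0, #14" == dct_const_round_shift() */
  printf("%d\n", (100 * cospi_16_64 + 0x2000) >> 14);          /* 71 */
  return 0;
}
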
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
+++ /dev/null
@@ -1,1317 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(
-        int16x8_t *q8s16,
-        int16x8_t *q9s16,
-        int16x8_t *q10s16,
-        int16x8_t *q11s16,
-        int16x8_t *q12s16,
-        int16x8_t *q13s16,
-        int16x8_t *q14s16,
-        int16x8_t *q15s16) {
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-    d16s16 = vget_low_s16(*q8s16);
-    d17s16 = vget_high_s16(*q8s16);
-    d18s16 = vget_low_s16(*q9s16);
-    d19s16 = vget_high_s16(*q9s16);
-    d20s16 = vget_low_s16(*q10s16);
-    d21s16 = vget_high_s16(*q10s16);
-    d22s16 = vget_low_s16(*q11s16);
-    d23s16 = vget_high_s16(*q11s16);
-    d24s16 = vget_low_s16(*q12s16);
-    d25s16 = vget_high_s16(*q12s16);
-    d26s16 = vget_low_s16(*q13s16);
-    d27s16 = vget_high_s16(*q13s16);
-    d28s16 = vget_low_s16(*q14s16);
-    d29s16 = vget_high_s16(*q14s16);
-    d30s16 = vget_low_s16(*q15s16);
-    d31s16 = vget_high_s16(*q15s16);
-
-    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
-    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
-    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-    *q12s16 = vcombine_s16(d17s16, d25s16);
-    *q13s16 = vcombine_s16(d19s16, d27s16);
-    *q14s16 = vcombine_s16(d21s16, d29s16);
-    *q15s16 = vcombine_s16(d23s16, d31s16);
-
-    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
-                        vreinterpretq_s32_s16(*q10s16));
-    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
-                        vreinterpretq_s32_s16(*q11s16));
-    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
-                        vreinterpretq_s32_s16(*q14s16));
-    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
-                        vreinterpretq_s32_s16(*q15s16));
-
-    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-    *q8s16  = q0x2s16.val[0];
-    *q9s16  = q0x2s16.val[1];
-    *q10s16 = q1x2s16.val[0];
-    *q11s16 = q1x2s16.val[1];
-    *q12s16 = q2x2s16.val[0];
-    *q13s16 = q2x2s16.val[1];
-    *q14s16 = q3x2s16.val[0];
-    *q15s16 = q3x2s16.val[1];
-    return;
-}
-
-void vp9_idct16x16_256_add_neon_pass1(
-        int16_t *in,
-        int16_t *out,
-        int output_stride) {
-    int16x4_t d0s16, d1s16, d2s16, d3s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
-    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-    int16x8x2_t q0x2s16;
-
-    q0x2s16 = vld2q_s16(in);
-    q8s16  = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q9s16  = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q10s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q11s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q12s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q13s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q14s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q15s16 = q0x2s16.val[0];
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-    d30s16 = vget_low_s16(q15s16);
-    d31s16 = vget_high_s16(q15s16);
-
-    // stage 3
-    d0s16 = vdup_n_s16(cospi_28_64);
-    d1s16 = vdup_n_s16(cospi_4_64);
-
-    q2s32 = vmull_s16(d18s16, d0s16);
-    q3s32 = vmull_s16(d19s16, d0s16);
-    q5s32 = vmull_s16(d18s16, d1s16);
-    q6s32 = vmull_s16(d19s16, d1s16);
-
-    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
-    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
-    d2s16 = vdup_n_s16(cospi_12_64);
-    d3s16 = vdup_n_s16(cospi_20_64);
-
-    d8s16 = vqrshrn_n_s32(q2s32, 14);
-    d9s16 = vqrshrn_n_s32(q3s32, 14);
-    d14s16 = vqrshrn_n_s32(q5s32, 14);
-    d15s16 = vqrshrn_n_s32(q6s32, 14);
-    q4s16 = vcombine_s16(d8s16, d9s16);
-    q7s16 = vcombine_s16(d14s16, d15s16);
-
-    q2s32 = vmull_s16(d26s16, d2s16);
-    q3s32 = vmull_s16(d27s16, d2s16);
-    q9s32 = vmull_s16(d26s16, d3s16);
-    q15s32 = vmull_s16(d27s16, d3s16);
-
-    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
-    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
-    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
-    d10s16 = vqrshrn_n_s32(q2s32, 14);
-    d11s16 = vqrshrn_n_s32(q3s32, 14);
-    d12s16 = vqrshrn_n_s32(q9s32, 14);
-    d13s16 = vqrshrn_n_s32(q15s32, 14);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    // stage 4
-    d30s16 = vdup_n_s16(cospi_16_64);
-
-    q2s32 = vmull_s16(d16s16, d30s16);
-    q11s32 = vmull_s16(d17s16, d30s16);
-    q0s32 = vmull_s16(d24s16, d30s16);
-    q1s32 = vmull_s16(d25s16, d30s16);
-
-    d30s16 = vdup_n_s16(cospi_24_64);
-    d31s16 = vdup_n_s16(cospi_8_64);
-
-    q3s32 = vaddq_s32(q2s32, q0s32);
-    q12s32 = vaddq_s32(q11s32, q1s32);
-    q13s32 = vsubq_s32(q2s32, q0s32);
-    q1s32 = vsubq_s32(q11s32, q1s32);
-
-    d16s16 = vqrshrn_n_s32(q3s32, 14);
-    d17s16 = vqrshrn_n_s32(q12s32, 14);
-    d18s16 = vqrshrn_n_s32(q13s32, 14);
-    d19s16 = vqrshrn_n_s32(q1s32, 14);
-    q8s16 = vcombine_s16(d16s16, d17s16);
-    q9s16 = vcombine_s16(d18s16, d19s16);
-
-    q0s32 = vmull_s16(d20s16, d31s16);
-    q1s32 = vmull_s16(d21s16, d31s16);
-    q12s32 = vmull_s16(d20s16, d30s16);
-    q13s32 = vmull_s16(d21s16, d30s16);
-
-    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
-    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
-    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
-    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
-    d22s16 = vqrshrn_n_s32(q0s32, 14);
-    d23s16 = vqrshrn_n_s32(q1s32, 14);
-    d20s16 = vqrshrn_n_s32(q12s32, 14);
-    d21s16 = vqrshrn_n_s32(q13s32, 14);
-    q10s16 = vcombine_s16(d20s16, d21s16);
-    q11s16 = vcombine_s16(d22s16, d23s16);
-
-    q13s16 = vsubq_s16(q4s16, q5s16);
-    q4s16 = vaddq_s16(q4s16, q5s16);
-    q14s16 = vsubq_s16(q7s16, q6s16);
-    q15s16 = vaddq_s16(q6s16, q7s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-
-    // stage 5
-    q0s16 = vaddq_s16(q8s16, q11s16);
-    q1s16 = vaddq_s16(q9s16, q10s16);
-    q2s16 = vsubq_s16(q9s16, q10s16);
-    q3s16 = vsubq_s16(q8s16, q11s16);
-
-    d16s16 = vdup_n_s16(cospi_16_64);
-
-    q11s32 = vmull_s16(d26s16, d16s16);
-    q12s32 = vmull_s16(d27s16, d16s16);
-    q9s32 = vmull_s16(d28s16, d16s16);
-    q10s32 = vmull_s16(d29s16, d16s16);
-
-    q6s32 = vsubq_s32(q9s32, q11s32);
-    q13s32 = vsubq_s32(q10s32, q12s32);
-    q9s32 = vaddq_s32(q9s32, q11s32);
-    q10s32 = vaddq_s32(q10s32, q12s32);
-
-    d10s16 = vqrshrn_n_s32(q6s32, 14);
-    d11s16 = vqrshrn_n_s32(q13s32, 14);
-    d12s16 = vqrshrn_n_s32(q9s32, 14);
-    d13s16 = vqrshrn_n_s32(q10s32, 14);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    // stage 6
-    q8s16 = vaddq_s16(q0s16, q15s16);
-    q9s16 = vaddq_s16(q1s16, q6s16);
-    q10s16 = vaddq_s16(q2s16, q5s16);
-    q11s16 = vaddq_s16(q3s16, q4s16);
-    q12s16 = vsubq_s16(q3s16, q4s16);
-    q13s16 = vsubq_s16(q2s16, q5s16);
-    q14s16 = vsubq_s16(q1s16, q6s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-    // store the data
-    output_stride >>= 1;  // output_stride / 2, out is int16_t
-    vst1_u64((uint64_t *)out, d16u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d17u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d18u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d19u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d20u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d21u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d22u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d23u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d24u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d28u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d29u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d30u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d31u64);
-    return;
-}
-
-void vp9_idct16x16_256_add_neon_pass2(
-        int16_t *src,
-        int16_t *out,
-        int16_t *pass1Output,
-        int16_t skip_adding,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8_t *d;
-    uint8x8_t d12u8, d13u8;
-    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    uint64x1_t d24u64, d25u64, d26u64, d27u64;
-    int64x1_t d12s64, d13s64;
-    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
-    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-    int32x4_t q10s32, q11s32, q12s32, q13s32;
-    int16x8x2_t q0x2s16;
-
-    q0x2s16 = vld2q_s16(src);
-    q8s16  = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q9s16  = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q10s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q11s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q12s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q13s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q14s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q15s16 = q0x2s16.val[0];
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-    d30s16 = vget_low_s16(q15s16);
-    d31s16 = vget_high_s16(q15s16);
-
-    // stage 3
-    d12s16 = vdup_n_s16(cospi_30_64);
-    d13s16 = vdup_n_s16(cospi_2_64);
-
-    q2s32 = vmull_s16(d16s16, d12s16);
-    q3s32 = vmull_s16(d17s16, d12s16);
-    q1s32 = vmull_s16(d16s16, d13s16);
-    q4s32 = vmull_s16(d17s16, d13s16);
-
-    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
-    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
-    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
-    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
-    d0s16 = vqrshrn_n_s32(q2s32, 14);
-    d1s16 = vqrshrn_n_s32(q3s32, 14);
-    d14s16 = vqrshrn_n_s32(q1s32, 14);
-    d15s16 = vqrshrn_n_s32(q4s32, 14);
-    q0s16 = vcombine_s16(d0s16, d1s16);
-    q7s16 = vcombine_s16(d14s16, d15s16);
-
-    d30s16 = vdup_n_s16(cospi_14_64);
-    d31s16 = vdup_n_s16(cospi_18_64);
-
-    q2s32 = vmull_s16(d24s16, d30s16);
-    q3s32 = vmull_s16(d25s16, d30s16);
-    q4s32 = vmull_s16(d24s16, d31s16);
-    q5s32 = vmull_s16(d25s16, d31s16);
-
-    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
-    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
-    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
-    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
-    d2s16 = vqrshrn_n_s32(q2s32, 14);
-    d3s16 = vqrshrn_n_s32(q3s32, 14);
-    d12s16 = vqrshrn_n_s32(q4s32, 14);
-    d13s16 = vqrshrn_n_s32(q5s32, 14);
-    q1s16 = vcombine_s16(d2s16, d3s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    d30s16 = vdup_n_s16(cospi_22_64);
-    d31s16 = vdup_n_s16(cospi_10_64);
-
-    q11s32 = vmull_s16(d20s16, d30s16);
-    q12s32 = vmull_s16(d21s16, d30s16);
-    q4s32 = vmull_s16(d20s16, d31s16);
-    q5s32 = vmull_s16(d21s16, d31s16);
-
-    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
-    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
-    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
-    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
-    d4s16 = vqrshrn_n_s32(q11s32, 14);
-    d5s16 = vqrshrn_n_s32(q12s32, 14);
-    d11s16 = vqrshrn_n_s32(q5s32, 14);
-    d10s16 = vqrshrn_n_s32(q4s32, 14);
-    q2s16 = vcombine_s16(d4s16, d5s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    d30s16 = vdup_n_s16(cospi_6_64);
-    d31s16 = vdup_n_s16(cospi_26_64);
-
-    q10s32 = vmull_s16(d28s16, d30s16);
-    q11s32 = vmull_s16(d29s16, d30s16);
-    q12s32 = vmull_s16(d28s16, d31s16);
-    q13s32 = vmull_s16(d29s16, d31s16);
-
-    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
-    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
-    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
-    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
-    d6s16 = vqrshrn_n_s32(q10s32, 14);
-    d7s16 = vqrshrn_n_s32(q11s32, 14);
-    d8s16 = vqrshrn_n_s32(q12s32, 14);
-    d9s16 = vqrshrn_n_s32(q13s32, 14);
-    q3s16 = vcombine_s16(d6s16, d7s16);
-    q4s16 = vcombine_s16(d8s16, d9s16);
-
-    // stage 3
-    q9s16  = vsubq_s16(q0s16, q1s16);
-    q0s16  = vaddq_s16(q0s16, q1s16);
-    q10s16 = vsubq_s16(q3s16, q2s16);
-    q11s16 = vaddq_s16(q2s16, q3s16);
-    q12s16 = vaddq_s16(q4s16, q5s16);
-    q13s16 = vsubq_s16(q4s16, q5s16);
-    q14s16 = vsubq_s16(q7s16, q6s16);
-    q7s16  = vaddq_s16(q6s16, q7s16);
-
-    // stage 4
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-
-    d30s16 = vdup_n_s16(cospi_8_64);
-    d31s16 = vdup_n_s16(cospi_24_64);
-
-    q2s32 = vmull_s16(d18s16, d31s16);
-    q3s32 = vmull_s16(d19s16, d31s16);
-    q4s32 = vmull_s16(d28s16, d31s16);
-    q5s32 = vmull_s16(d29s16, d31s16);
-
-    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
-    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
-    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
-    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
-    d12s16 = vqrshrn_n_s32(q2s32, 14);
-    d13s16 = vqrshrn_n_s32(q3s32, 14);
-    d2s16 = vqrshrn_n_s32(q4s32, 14);
-    d3s16 = vqrshrn_n_s32(q5s32, 14);
-    q1s16 = vcombine_s16(d2s16, d3s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    q3s16 = q11s16;
-    q4s16 = q12s16;
-
-    d30s16 = vdup_n_s16(-cospi_8_64);
-    q11s32 = vmull_s16(d26s16, d30s16);
-    q12s32 = vmull_s16(d27s16, d30s16);
-    q8s32 = vmull_s16(d20s16, d30s16);
-    q9s32 = vmull_s16(d21s16, d30s16);
-
-    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
-    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
-    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
-    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
-    d4s16 = vqrshrn_n_s32(q11s32, 14);
-    d5s16 = vqrshrn_n_s32(q12s32, 14);
-    d10s16 = vqrshrn_n_s32(q8s32, 14);
-    d11s16 = vqrshrn_n_s32(q9s32, 14);
-    q2s16 = vcombine_s16(d4s16, d5s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    // stage 5
-    q8s16  = vaddq_s16(q0s16, q3s16);
-    q9s16  = vaddq_s16(q1s16, q2s16);
-    q10s16 = vsubq_s16(q1s16, q2s16);
-    q11s16 = vsubq_s16(q0s16, q3s16);
-    q12s16 = vsubq_s16(q7s16, q4s16);
-    q13s16 = vsubq_s16(q6s16, q5s16);
-    q14s16 = vaddq_s16(q6s16, q5s16);
-    q15s16 = vaddq_s16(q7s16, q4s16);
-
-    // stage 6
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-
-    d14s16 = vdup_n_s16(cospi_16_64);
-
-    q3s32 = vmull_s16(d26s16, d14s16);
-    q4s32 = vmull_s16(d27s16, d14s16);
-    q0s32 = vmull_s16(d20s16, d14s16);
-    q1s32 = vmull_s16(d21s16, d14s16);
-
-    q5s32 = vsubq_s32(q3s32, q0s32);
-    q6s32 = vsubq_s32(q4s32, q1s32);
-    q10s32 = vaddq_s32(q3s32, q0s32);
-    q4s32 = vaddq_s32(q4s32, q1s32);
-
-    d4s16 = vqrshrn_n_s32(q5s32, 14);
-    d5s16 = vqrshrn_n_s32(q6s32, 14);
-    d10s16 = vqrshrn_n_s32(q10s32, 14);
-    d11s16 = vqrshrn_n_s32(q4s32, 14);
-    q2s16 = vcombine_s16(d4s16, d5s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    q0s32 = vmull_s16(d22s16, d14s16);
-    q1s32 = vmull_s16(d23s16, d14s16);
-    q13s32 = vmull_s16(d24s16, d14s16);
-    q6s32 = vmull_s16(d25s16, d14s16);
-
-    q10s32 = vsubq_s32(q13s32, q0s32);
-    q4s32 = vsubq_s32(q6s32, q1s32);
-    q13s32 = vaddq_s32(q13s32, q0s32);
-    q6s32 = vaddq_s32(q6s32, q1s32);
-
-    d6s16 = vqrshrn_n_s32(q10s32, 14);
-    d7s16 = vqrshrn_n_s32(q4s32, 14);
-    d8s16 = vqrshrn_n_s32(q13s32, 14);
-    d9s16 = vqrshrn_n_s32(q6s32, 14);
-    q3s16 = vcombine_s16(d6s16, d7s16);
-    q4s16 = vcombine_s16(d8s16, d9s16);
-
-    // stage 7
-    if (skip_adding != 0) {
-        d = dest;
-        // load the data in pass1
-        q0s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q1s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        d13s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-
-        q12s16 = vaddq_s16(q0s16, q15s16);
-        q13s16 = vaddq_s16(q1s16, q14s16);
-        q12s16 = vrshrq_n_s16(q12s16, 6);
-        q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-        d += dest_stride;
-        q14s16 = vsubq_s16(q1s16, q14s16);
-        q15s16 = vsubq_s16(q0s16, q15s16);
-
-        q10s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q11s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        d13s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q12s16 = vaddq_s16(q10s16, q5s16);
-        q13s16 = vaddq_s16(q11s16, q4s16);
-        q12s16 = vrshrq_n_s16(q12s16, 6);
-        q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-        d += dest_stride;
-        q4s16 = vsubq_s16(q11s16, q4s16);
-        q5s16 = vsubq_s16(q10s16, q5s16);
-
-        q0s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q1s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        d13s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q12s16 = vaddq_s16(q0s16, q3s16);
-        q13s16 = vaddq_s16(q1s16, q2s16);
-        q12s16 = vrshrq_n_s16(q12s16, 6);
-        q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-        d += dest_stride;
-        q2s16 = vsubq_s16(q1s16, q2s16);
-        q3s16 = vsubq_s16(q0s16, q3s16);
-
-        q10s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q11s16 = vld1q_s16(pass1Output);
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        d13s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q12s16 = vaddq_s16(q10s16, q9s16);
-        q13s16 = vaddq_s16(q11s16, q8s16);
-        q12s16 = vrshrq_n_s16(q12s16, 6);
-        q13s16 = vrshrq_n_s16(q13s16, 6);
-        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
-                          vreinterpret_u8_s64(d12s64));
-        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
-                          vreinterpret_u8_s64(d13s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-        d += dest_stride;
-        q8s16 = vsubq_s16(q11s16, q8s16);
-        q9s16 = vsubq_s16(q10s16, q9s16);
-
-        // store the data  out 8,9,10,11,12,13,14,15
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q8s16 = vrshrq_n_s16(q8s16, 6);
-        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                         vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q9s16 = vrshrq_n_s16(q9s16, 6);
-        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                          vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q2s16 = vrshrq_n_s16(q2s16, 6);
-        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
-                          vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q3s16 = vrshrq_n_s16(q3s16, 6);
-        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
-                         vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q4s16 = vrshrq_n_s16(q4s16, 6);
-        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
-                         vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q5s16 = vrshrq_n_s16(q5s16, 6);
-        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
-                         vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        dest += dest_stride;
-        q14s16 = vrshrq_n_s16(q14s16, 6);
-        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
-                          vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-        d += dest_stride;
-
-        d12s64 = vld1_s64((int64_t *)dest);
-        q15s16 = vrshrq_n_s16(q15s16, 6);
-        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
-                          vreinterpret_u8_s64(d12s64));
-        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
-        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    } else {  // skip_adding_dest
-        q0s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q1s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q12s16 = vaddq_s16(q0s16, q15s16);
-        q13s16 = vaddq_s16(q1s16, q14s16);
-        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-        vst1_u64((uint64_t *)out, d24u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d25u64);
-        out += 12;
-        vst1_u64((uint64_t *)out, d26u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d27u64);
-        out += 12;
-        q14s16 = vsubq_s16(q1s16, q14s16);
-        q15s16 = vsubq_s16(q0s16, q15s16);
-
-        q10s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q11s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q12s16 = vaddq_s16(q10s16, q5s16);
-        q13s16 = vaddq_s16(q11s16, q4s16);
-        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-        vst1_u64((uint64_t *)out, d24u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d25u64);
-        out += 12;
-        vst1_u64((uint64_t *)out, d26u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d27u64);
-        out += 12;
-        q4s16 = vsubq_s16(q11s16, q4s16);
-        q5s16 = vsubq_s16(q10s16, q5s16);
-
-        q0s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q1s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q12s16 = vaddq_s16(q0s16, q3s16);
-        q13s16 = vaddq_s16(q1s16, q2s16);
-        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-        vst1_u64((uint64_t *)out, d24u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d25u64);
-        out += 12;
-        vst1_u64((uint64_t *)out, d26u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d27u64);
-        out += 12;
-        q2s16 = vsubq_s16(q1s16, q2s16);
-        q3s16 = vsubq_s16(q0s16, q3s16);
-
-        q10s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q11s16 = vld1q_s16(pass1Output);
-        pass1Output += 8;
-        q12s16 = vaddq_s16(q10s16, q9s16);
-        q13s16 = vaddq_s16(q11s16, q8s16);
-        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-        vst1_u64((uint64_t *)out, d24u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d25u64);
-        out += 12;
-        vst1_u64((uint64_t *)out, d26u64);
-        out += 4;
-        vst1_u64((uint64_t *)out, d27u64);
-        out += 12;
-        q8s16 = vsubq_s16(q11s16, q8s16);
-        q9s16 = vsubq_s16(q10s16, q9s16);
-
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
-        out += 12;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
-        out += 4;
-        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
-    }
-    return;
-}
-
-void vp9_idct16x16_10_add_neon_pass1(
-        int16_t *in,
-        int16_t *out,
-        int output_stride) {
-    int16x4_t d4s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    int32x4_t q6s32, q9s32;
-    int32x4_t q10s32, q11s32, q12s32, q15s32;
-    int16x8x2_t q0x2s16;
-
-    q0x2s16 = vld2q_s16(in);
-    q8s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q9s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q10s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q11s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q12s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q13s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q14s16 = q0x2s16.val[0];
-    in += 16;
-    q0x2s16 = vld2q_s16(in);
-    q15s16 = q0x2s16.val[0];
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    // stage 3
-    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
-    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
-    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-    // stage 4
-    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
-    d4s16 = vdup_n_s16(cospi_16_64);
-
-    q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
-    d8s16 = vget_low_s16(q4s16);
-    d9s16 = vget_high_s16(q4s16);
-    d14s16 = vget_low_s16(q7s16);
-    d15s16 = vget_high_s16(q7s16);
-    q9s32  = vmull_s16(d14s16, d4s16);
-    q10s32 = vmull_s16(d15s16, d4s16);
-    q12s32 = vmull_s16(d9s16, d4s16);
-    q11s32 = vmull_s16(d8s16, d4s16);
-
-    q15s32 = vsubq_s32(q10s32, q12s32);
-    q6s32 = vsubq_s32(q9s32, q11s32);
-    q9s32 = vaddq_s32(q9s32, q11s32);
-    q10s32 = vaddq_s32(q10s32, q12s32);
-
-    d11s16 = vqrshrn_n_s32(q15s32, 14);
-    d10s16 = vqrshrn_n_s32(q6s32, 14);
-    d12s16 = vqrshrn_n_s32(q9s32, 14);
-    d13s16 = vqrshrn_n_s32(q10s32, 14);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    // stage 6
-    q2s16 = vaddq_s16(q8s16, q7s16);
-    q9s16 = vaddq_s16(q8s16, q6s16);
-    q10s16 = vaddq_s16(q8s16, q5s16);
-    q11s16 = vaddq_s16(q8s16, q4s16);
-    q12s16 = vsubq_s16(q8s16, q4s16);
-    q13s16 = vsubq_s16(q8s16, q5s16);
-    q14s16 = vsubq_s16(q8s16, q6s16);
-    q15s16 = vsubq_s16(q8s16, q7s16);
-
-    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-    // store the data
-    output_stride >>= 1;  // output_stride / 2, out is int16_t
-    vst1_u64((uint64_t *)out, d4u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d5u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d18u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d19u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d20u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d21u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d22u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d23u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d24u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d28u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d29u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d30u64);
-    out += output_stride;
-    vst1_u64((uint64_t *)out, d31u64);
-    return;
-}
-
-void vp9_idct16x16_10_add_neon_pass2(
-        int16_t *src,
-        int16_t *out,
-        int16_t *pass1Output,
-        int16_t skip_adding,
-        uint8_t *dest,
-        int dest_stride) {
-    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    int16x4_t d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
-    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
-    uint64x1_t d16u64, d17u64, d18u64, d19u64;
-    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-    int32x4_t q10s32, q11s32, q12s32, q13s32;
-    int16x8x2_t q0x2s16;
-    (void)skip_adding;
-    (void)dest;
-    (void)dest_stride;
-
-    q0x2s16 = vld2q_s16(src);
-    q8s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q9s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q10s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q11s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q12s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q13s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q14s16 = q0x2s16.val[0];
-    src += 16;
-    q0x2s16 = vld2q_s16(src);
-    q15s16 = q0x2s16.val[0];
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    // stage 3
-    q6s16 = vdupq_n_s16(cospi_30_64 * 2);
-    q0s16 = vqrdmulhq_s16(q8s16, q6s16);
-    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
-    q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
-    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
-    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
-    q3s16 = vqrdmulhq_s16(q9s16, q15s16);
-    q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
-    // stage 4
-    d0s16 = vget_low_s16(q0s16);
-    d1s16 = vget_high_s16(q0s16);
-    d6s16 = vget_low_s16(q3s16);
-    d7s16 = vget_high_s16(q3s16);
-    d8s16 = vget_low_s16(q4s16);
-    d9s16 = vget_high_s16(q4s16);
-    d14s16 = vget_low_s16(q7s16);
-    d15s16 = vget_high_s16(q7s16);
-
-    d30s16 = vdup_n_s16(cospi_8_64);
-    d31s16 = vdup_n_s16(cospi_24_64);
-
-    q12s32 = vmull_s16(d14s16, d31s16);
-    q5s32 = vmull_s16(d15s16, d31s16);
-    q2s32 = vmull_s16(d0s16, d31s16);
-    q11s32 = vmull_s16(d1s16, d31s16);
-
-    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
-    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
-    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
-    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
-    d2s16 = vqrshrn_n_s32(q12s32, 14);
-    d3s16 = vqrshrn_n_s32(q5s32, 14);
-    d12s16 = vqrshrn_n_s32(q2s32, 14);
-    d13s16 = vqrshrn_n_s32(q11s32, 14);
-    q1s16 = vcombine_s16(d2s16, d3s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    d30s16 = vdup_n_s16(-cospi_8_64);
-    q10s32 = vmull_s16(d8s16, d30s16);
-    q13s32 = vmull_s16(d9s16, d30s16);
-    q8s32 = vmull_s16(d6s16, d30s16);
-    q9s32 = vmull_s16(d7s16, d30s16);
-
-    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
-    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
-    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
-    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
-    d4s16 = vqrshrn_n_s32(q10s32, 14);
-    d5s16 = vqrshrn_n_s32(q13s32, 14);
-    d10s16 = vqrshrn_n_s32(q8s32, 14);
-    d11s16 = vqrshrn_n_s32(q9s32, 14);
-    q2s16 = vcombine_s16(d4s16, d5s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    // stage 5
-    q8s16  = vaddq_s16(q0s16, q3s16);
-    q9s16  = vaddq_s16(q1s16, q2s16);
-    q10s16 = vsubq_s16(q1s16, q2s16);
-    q11s16 = vsubq_s16(q0s16, q3s16);
-    q12s16 = vsubq_s16(q7s16, q4s16);
-    q13s16 = vsubq_s16(q6s16, q5s16);
-    q14s16 = vaddq_s16(q6s16, q5s16);
-    q15s16 = vaddq_s16(q7s16, q4s16);
-
-    // stage 6
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-
-    d14s16 = vdup_n_s16(cospi_16_64);
-    q3s32 = vmull_s16(d26s16, d14s16);
-    q4s32 = vmull_s16(d27s16, d14s16);
-    q0s32 = vmull_s16(d20s16, d14s16);
-    q1s32 = vmull_s16(d21s16, d14s16);
-
-    q5s32 = vsubq_s32(q3s32, q0s32);
-    q6s32 = vsubq_s32(q4s32, q1s32);
-    q0s32 = vaddq_s32(q3s32, q0s32);
-    q4s32 = vaddq_s32(q4s32, q1s32);
-
-    d4s16 = vqrshrn_n_s32(q5s32, 14);
-    d5s16 = vqrshrn_n_s32(q6s32, 14);
-    d10s16 = vqrshrn_n_s32(q0s32, 14);
-    d11s16 = vqrshrn_n_s32(q4s32, 14);
-    q2s16 = vcombine_s16(d4s16, d5s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    q0s32 = vmull_s16(d22s16, d14s16);
-    q1s32 = vmull_s16(d23s16, d14s16);
-    q13s32 = vmull_s16(d24s16, d14s16);
-    q6s32 = vmull_s16(d25s16, d14s16);
-
-    q10s32 = vsubq_s32(q13s32, q0s32);
-    q4s32 = vsubq_s32(q6s32, q1s32);
-    q13s32 = vaddq_s32(q13s32, q0s32);
-    q6s32 = vaddq_s32(q6s32, q1s32);
-
-    d6s16 = vqrshrn_n_s32(q10s32, 14);
-    d7s16 = vqrshrn_n_s32(q4s32, 14);
-    d8s16 = vqrshrn_n_s32(q13s32, 14);
-    d9s16 = vqrshrn_n_s32(q6s32, 14);
-    q3s16 = vcombine_s16(d6s16, d7s16);
-    q4s16 = vcombine_s16(d8s16, d9s16);
-
-    // stage 7
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    d4u64  = vreinterpret_u64_s16(vget_low_s16(q2s16));
-    d5u64  = vreinterpret_u64_s16(vget_high_s16(q2s16));
-    d6u64  = vreinterpret_u64_s16(vget_low_s16(q3s16));
-    d7u64  = vreinterpret_u64_s16(vget_high_s16(q3s16));
-    d8u64  = vreinterpret_u64_s16(vget_low_s16(q4s16));
-    d9u64  = vreinterpret_u64_s16(vget_high_s16(q4s16));
-    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
-    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
-    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-    vst1_u64((uint64_t *)out, d16u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d17u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d18u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d19u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d4u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d5u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d6u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d7u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d8u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d9u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d10u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d11u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d28u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d29u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d30u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d31u64);
-    return;
-}
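Each vqrshrn_n_s32(x, 14) above is the saturating NEON form of the scalar
dct_const_round_shift() helper, and the alternating out += 4 / out += 12
steps advance exactly one 16-sample row (4 + 12 = 16) per pair of 8-byte
stores into the 16-wide intermediate buffer. A minimal scalar sketch of the
rounding helper, assuming the usual libvpx macro definitions:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Scalar counterpart of vqrshrn_n_s32(x, DCT_CONST_BITS), ignoring the
 * narrowing saturation that the NEON instruction also applies. */
static int32_t dct_const_round_shift(int32_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}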
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
+++ /dev/null
@@ -1,1179 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_idct16x16_256_add_neon_pass1|
-    EXPORT  |vp9_idct16x16_256_add_neon_pass2|
-    EXPORT  |vp9_idct16x16_10_add_neon_pass1|
-    EXPORT  |vp9_idct16x16_10_add_neon_pass2|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
-;                                          int16_t *output, int output_stride)
-;
-; r0  int16_t *input
-; r1  int16_t *output
-; r2  int  output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0xc00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r12, #0x3e00
-    add             r12, #0xc5
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r12                   ; duplicate cospi_4_64
-
-    ; preloading to avoid stall
-    ; generate cospi_12_64 = 13623
-    mov             r3, #0x3500
-    add             r3, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r12, #0x2300
-    add             r12, #0x8e
-
-    ; step2[4] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; step2[4] * cospi_4_64
-    vmull.s16       q5, d18, d1
-    vmull.s16       q6, d19, d1
-
-    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
-    vmlal.s16       q5, d30, d0
-    vmlal.s16       q6, d31, d0
-
-    vdup.16         d2, r3                    ; duplicate cospi_12_64
-    vdup.16         d3, r12                   ; duplicate cospi_20_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q5, #14              ; >> 14
-    vqrshrn.s32     d15, q6, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; step2[5] * cospi_12_64
-    vmull.s16       q2, d26, d2
-    vmull.s16       q3, d27, d2
-
-    ; step2[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q15, d27, d3
-
-    ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
-    vmlsl.s16       q2, d22, d3
-    vmlsl.s16       q3, d23, d3
-
-    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q15, d23, d2
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q2, #14              ; >> 14
-    vqrshrn.s32     d11, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q15, #14             ; >> 14
-
-    ; stage 4
-    vdup.16         d30, r3                   ; cospi_16_64
-
-    ; step1[0] * cospi_16_64
-    vmull.s16       q2, d16, d30
-    vmull.s16       q11, d17, d30
-
-    ; step1[1] * cospi_16_64
-    vmull.s16       q0, d24, d30
-    vmull.s16       q1, d25, d30
-
-    ; generate cospi_8_64 = 15137
-    mov             r3, #0x3b00
-    add             r3, #0x21
-
-    vdup.16         d30, r12                  ; duplicate cospi_24_64
-    vdup.16         d31, r3                   ; duplicate cospi_8_64
-
-    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
-    vadd.s32        q3, q2, q0
-    vadd.s32        q12, q11, q1
-
-    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
-    vsub.s32        q13, q2, q0
-    vsub.s32        q1, q11, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d16, q3, #14              ; >> 14
-    vqrshrn.s32     d17, q12, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d18, q13, #14             ; >> 14
-    vqrshrn.s32     d19, q1, #14              ; >> 14
-
-    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-    ; step1[2] * cospi_8_64
-    vmull.s16       q0, d20, d31
-    vmull.s16       q1, d21, d31
-
-    ; step1[2] * cospi_24_64
-    vmull.s16       q12, d20, d30
-    vmull.s16       q13, d21, d30
-
-    ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
-    vmlal.s16       q0, d28, d30
-    vmlal.s16       q1, d29, d30
-
-    ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
-    vmlsl.s16       q12, d28, d31
-    vmlsl.s16       q13, d29, d31
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d22, q0, #14              ; >> 14
-    vqrshrn.s32     d23, q1, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d20, q12, #14             ; >> 14
-    vqrshrn.s32     d21, q13, #14             ; >> 14
-
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
-    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; stage 5
-    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
-    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
-    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
-    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
-
-    vdup.16         d16, r3                   ; duplicate cospi_16_64
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q6, q9, q11
-    vsub.s32        q13, q10, q12
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-    vqrshrn.s32     d11, q13, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d16}, [r1], r2
-    vst1.64         {d17}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |vp9_idct16x16_256_add_neon_pass1|
-
-;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
-;                                        int16_t *output,
-;                                        int16_t *pass1Output,
-;                                        int16_t skip_adding,
-;                                        uint8_t *dest,
-;                                        int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
-; r4  uint8_t *dest,
-; r5  int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_256_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0
-
-    ; generate  cospi_30_64 = 1606
-    mov             r3, #0x0600
-    add             r3, #0x46
-
-    ; generate cospi_2_64  = 16305
-    mov             r12, #0x3f00
-    add             r12, #0xb1
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d12, r3                   ; duplicate cospi_30_64
-    vdup.16         d13, r12                  ; duplicate cospi_2_64
-
-    ; preloading to avoid stall
-    ; generate cospi_14_64 = 12665
-    mov             r3, #0x3100
-    add             r3, #0x79
-
-    ; generate cospi_18_64 = 10394
-    mov             r12, #0x2800
-    add             r12, #0x9a
-
-    ; step1[8] * cospi_30_64
-    vmull.s16       q2, d16, d12
-    vmull.s16       q3, d17, d12
-
-    ; step1[8] * cospi_2_64
-    vmull.s16       q1, d16, d13
-    vmull.s16       q4, d17, d13
-
-    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
-    vmlsl.s16       q2, d30, d13
-    vmlsl.s16       q3, d31, d13
-
-    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
-    vmlal.s16       q1, d30, d12
-    vmlal.s16       q4, d31, d12
-
-    vdup.16         d30, r3                   ; duplicate cospi_14_64
-    vdup.16         d31, r12                  ; duplicate cospi_18_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d0, q2, #14               ; >> 14
-    vqrshrn.s32     d1, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q1, #14              ; >> 14
-    vqrshrn.s32     d15, q4, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_10_64 = 14449
-    mov             r12, #0x3800
-    add             r12, #0x71
-
-    ; step1[9] * cospi_14_64
-    vmull.s16       q2, d24, d30
-    vmull.s16       q3, d25, d30
-
-    ; step1[9] * cospi_18_64
-    vmull.s16       q4, d24, d31
-    vmull.s16       q5, d25, d31
-
-    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
-    vmlsl.s16       q2, d22, d31
-    vmlsl.s16       q3, d23, d31
-
-    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
-    vmlal.s16       q4, d22, d30
-    vmlal.s16       q5, d23, d30
-
-    vdup.16         d30, r3                   ; duplicate cospi_22_64
-    vdup.16         d31, r12                  ; duplicate cospi_10_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q2, #14               ; >> 14
-    vqrshrn.s32     d3, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q4, #14              ; >> 14
-    vqrshrn.s32     d13, q5, #14              ; >> 14
-
-    ; step1[10] * cospi_22_64
-    vmull.s16       q11, d20, d30
-    vmull.s16       q12, d21, d30
-
-    ; step1[10] * cospi_10_64
-    vmull.s16       q4, d20, d31
-    vmull.s16       q5, d21, d31
-
-    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
-    vmlsl.s16       q11, d26, d31
-    vmlsl.s16       q12, d27, d31
-
-    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
-    vmlal.s16       q4, d26, d30
-    vmlal.s16       q5, d27, d30
-
-    ; preloading to avoid stall
-    ; generate cospi_6_64 = 15679
-    mov             r3, #0x3d00
-    add             r3, #0x3f
-
-    ; generate cospi_26_64 = 4756
-    mov             r12, #0x1200
-    add             r12, #0x94
-
-    vdup.16         d30, r3                   ; duplicate cospi_6_64
-    vdup.16         d31, r12                  ; duplicate cospi_26_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d11, q5, #14              ; >> 14
-    vqrshrn.s32     d10, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_6_64
-    vmull.s16       q10, d28, d30
-    vmull.s16       q11, d29, d30
-
-    ; step1[11] * cospi_26_64
-    vmull.s16       q12, d28, d31
-    vmull.s16       q13, d29, d31
-
-    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
-    vmlsl.s16       q10, d18, d31
-    vmlsl.s16       q11, d19, d31
-
-    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
-    vmlal.s16       q12, d18, d30
-    vmlal.s16       q13, d19, d30
-
-    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
-    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q11, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q12, #14              ; >> 14
-    vqrshrn.s32     d9, q13, #14              ; >> 14
-
-    ; stage 3
-    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
-    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
-    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
-    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
-    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+step2[15]
-    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-
-    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d18, d31
-    vmull.s16       q3, d19, d31
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q4, d28, d31
-    vmull.s16       q5, d29, d31
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d28, d30
-    vmlal.s16       q3, d29, d30
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q4, d18, d30
-    vmlsl.s16       q5, d19, d30
-
-    rsb             r12, #0
-    vdup.16         d30, r12                  ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q4, #14               ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    vmov.s16        q3, q11
-    vmov.s16        q4, q12
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q11, d26, d30
-    vmull.s16       q12, d27, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d20, d30
-    vmull.s16       q9, d21, d30
-
-    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
-    vmlsl.s16       q11, d20, d31
-    vmlsl.s16       q12, d21, d31
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d26, d31
-    vmlal.s16       q9, d27, d31
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q10, q3, q0
-    vadd.s32        q4, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q10, #14             ; >> 14
-    vqrshrn.s32     d11, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-    cmp              r3, #0                   ; check if need adding dest data
-    beq              skip_adding_dest
-
-    ldr              r7, [sp, #28]            ; dest used to save element 0-7
-    mov              r9, r7                   ; save dest pointer for later use
-    ldr              r8, [sp, #32]            ; load dest_stride
-
-    ; stage 7
-    ; load the data in pass1
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
-    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q8                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q9, q9, #6
-    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q9                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q2, q2, #6
-    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q2                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q3, q3, #6
-    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q3                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q4, q4, #6
-    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q4                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q5, q5, #6
-    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q5                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q14, q14, #6
-    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q14                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q15, q15, #6
-    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q15                  ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    b               end_idct16x16_pass2
-
-skip_adding_dest
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |vp9_idct16x16_256_add_neon_pass2|
-
-;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
-;                                             int16_t *output, int output_stride)
-;
-; r0  int16_t *input
-; r1  int16_t *output
-; r2  int  output_stride)
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64*2 = 6392
-    mov             r3, #0x1800
-    add             r3, #0xf8
-
-    ; generate cospi_4_64*2  = 32138
-    mov             r12, #0x7d00
-    add             r12, #0x8a
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
-    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
-    ; the constant will change this to >> 14.
-    ; dct_const_round_shift(step2[4] * cospi_28_64);
-    vqrdmulh.s16    q4, q9, q0
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64*2 = 23170
-    mov             r3, #0x5a00
-    add             r3, #0x82
-
-    ; dct_const_round_shift(step2[4] * cospi_4_64);
-    vqrdmulh.s16    q7, q9, q1
-
-    ; stage 4
-    vdup.16         q1, r3                    ; cospi_16_64*2
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    vdup.16         d4, r3                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(step1[0] * cospi_16_64)
-    vqrdmulh.s16    q8, q8, q1
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d14, d4
-    vmull.s16       q10, d15, d4
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q12, d9, d4
-    vmull.s16       q11, d8, d4
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q15, q10, q12
-    vsub.s32        q6, q9, q11
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d11, q15, #14             ; >> 14
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
-    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
-    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d4}, [r1], r2
-    vst1.64         {d5}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |vp9_idct16x16_10_add_neon_pass1|
-
-;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
-;                                           int16_t *output,
-;                                           int16_t *pass1Output,
-;                                           int16_t skip_adding,
-;                                           uint8_t *dest,
-;                                           int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
-; r4  uint8_t *dest,
-; r5  int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vp9_idct16x16_10_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0
-
-    ; generate 2*cospi_30_64 = 3212
-    mov             r3, #0xc00
-    add             r3, #0x8c
-
-    ; generate 2*cospi_2_64  = 32610
-    mov             r12, #0x7f00
-    add             r12, #0x62
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
-
-    ; dct_const_round_shift(step1[8] * cospi_30_64)
-    vqrdmulh.s16    q0, q8, q6
-
-    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
-
-    ; dct_const_round_shift(step1[8] * cospi_2_64)
-    vqrdmulh.s16    q7, q8, q6
-
-    ; preloading to avoid stall
-    ; generate 2*cospi_26_64 = 9512
-    mov             r12, #0x2500
-    add             r12, #0x28
-    rsb             r12, #0
-    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
-
-    ; generate 2*cospi_6_64 = 31358
-    mov             r3, #0x7a00
-    add             r3, #0x7e
-    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
-
-    ; dct_const_round_shift(- step1[12] * cospi_26_64)
-    vqrdmulh.s16    q3, q9, q15
-
-    ; dct_const_round_shift(step1[12] * cospi_6_64)
-    vqrdmulh.s16    q4, q9, q14
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q12, d14, d31
-    vmull.s16       q5, d15, d31
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d0, d31
-    vmull.s16       q11, d1, d31
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q12, d0, d30
-    vmlsl.s16       q5, d1, d30
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d14, d30
-    vmlal.s16       q11, d15, d30
-
-    rsb              r12, #0
-    vdup.16          d30, r12                 ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q12, #14              ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q11, #14             ; >> 14
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q10, d8, d30
-    vmull.s16       q13, d9, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d6, d30
-    vmull.s16       q9, d7, d30
-
-    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
-    vmlsl.s16       q10, d6, d31
-    vmlsl.s16       q13, d7, d31
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d8, d31
-    vmlal.s16       q9, d9, d31
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q10, #14              ; >> 14
-    vqrshrn.s32     d5, q13, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q0, q3, q0
-    vadd.s32        q1, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q0, #14              ; >> 14
-    vqrshrn.s32     d11, q1, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift((-step1[11] + step1[12]) * cospi_16_64)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct10_16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |vp9_idct16x16_10_add_neon_pass2|
-    END
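Two implementation details in the file above are worth spelling out. The
mov/add pairs build each 16-bit cospi constant from two ARM-encodable
immediates (for example cospi_16_64 = 11585 = 0x2d00 + 0x41). And the
10_add paths replace the widening multiply plus vqrshrn pair with a single
vqrdmulh against a doubled constant: vqrdmulh computes
(2 * a * b + (1 << 15)) >> 16 with saturation, so passing 2 * cospi yields
(a * cospi + (1 << 13)) >> 14, which is dct_const_round_shift(a * cospi).
A hedged intrinsics sketch of that shortcut (round_shift_mul is an
illustrative name, not a libvpx function):

#include <arm_neon.h>
#include <stdint.h>

/* dct_const_round_shift(a[i] * cospi) for eight lanes at once, using the
 * doubled-constant vqrdmulh trick described in the comments above. Valid
 * while 2 * cospi fits in int16_t, which holds for every cospi_*_64
 * constant used here (the largest is 2 * 16305 = 32610). */
static int16x8_t round_shift_mul(int16x8_t a, int16_t cospi) {
  return vqrdmulhq_n_s16(a, (int16_t)(2 * cospi));
}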
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ /dev/null
@@ -1,186 +1,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-
-void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
-                                      int16_t *output,
-                                      int output_stride);
-void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
-                                      int16_t *output,
-                                      int16_t *pass1Output,
-                                      int16_t skip_adding,
-                                      uint8_t *dest,
-                                      int dest_stride);
-void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
-                                     int16_t *output,
-                                     int output_stride);
-void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
-                                     int16_t *output,
-                                     int16_t *pass1Output,
-                                     int16_t skip_adding,
-                                     uint8_t *dest,
-                                     int dest_stride);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif  // HAVE_NEON_ASM
-
-void vp9_idct16x16_256_add_neon(const int16_t *input,
-                                uint8_t *dest, int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16*16] = {0};
-  int16_t row_idct_output[16*16] = {0};
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  vp9_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_idct16x16_256_add_neon_pass2(input+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the lower 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     0,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  vp9_pop_neon(store_reg);
-#endif
-
-  return;
-}
-
-void vp9_idct16x16_10_add_neon(const int16_t *input,
-                               uint8_t *dest, int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16*16] = {0};
-  int16_t row_idct_output[16*16] = {0};
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  vp9_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7
-  // which will be saved into row_idct_output.
-  vp9_idct16x16_10_add_neon_pass2(input+1,
-                                        row_idct_output,
-                                        pass1_output,
-                                        0,
-                                        dest,
-                                        dest_stride);
-
-  /* Skip parallel idct on the lower 8 rows as they are all 0s */
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                     row_idct_output,
-                                     pass1_output,
-                                     1,
-                                     dest,
-                                     dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
-  // stage 6 result in pass1_output.
-  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with the result in pass1 (pass1_output) to calculate the final result in stage 7.
-  // Then add the result to the destination data.
-  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
-                                     row_idct_output+8,
-                                     pass1_output,
-                                     1,
-                                     dest+8,
-                                     dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  vp9_pop_neon(store_reg);
-#endif
-
-  return;
-}
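The "add the result to the destination data" step described in the comments
above is the usual final reconstruction: the asm does it with vrshr.s16 #6,
vaddw.u8 and vqmovun.s16, i.e. round the column-pass output by 6 bits, add
the prediction, and clip to 8 bits. A scalar sketch of one row of pixels
(add_residual_row is an illustrative helper name, not libvpx API; clip_pixel
mirrors the usual 8-bit clamp):

#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* dest[i] += ROUND_POWER_OF_TWO(residual[i], 6), clipped to [0, 255];
 * the scalar equivalent of the vrshr/vaddw/vqmovun sequence. */
static void add_residual_row(const int16_t *residual, uint8_t *dest, int n) {
  int i;
  for (i = 0; i < n; ++i)
    dest[i] = clip_pixel(dest[i] + ((residual[i] + 32) >> 6));
}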
--- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
+++ /dev/null
@@ -1,165 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-
-#include "vp9/common/vp9_idct.h"
-#include "vpx_ports/mem.h"
-
-static INLINE void LD_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vld1q_u8(d);
-    d += d_stride;
-    *q9u8 = vld1q_u8(d);
-    d += d_stride;
-    *q10u8 = vld1q_u8(d);
-    d += d_stride;
-    *q11u8 = vld1q_u8(d);
-    d += d_stride;
-    *q12u8 = vld1q_u8(d);
-    d += d_stride;
-    *q13u8 = vld1q_u8(d);
-    d += d_stride;
-    *q14u8 = vld1q_u8(d);
-    d += d_stride;
-    *q15u8 = vld1q_u8(d);
-    return;
-}
-
-static INLINE void ADD_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
-    *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
-    *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
-    *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
-    *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
-    *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
-    *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
-    *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
-    return;
-}
-
-static INLINE void SUB_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
-    *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
-    *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
-    *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
-    *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
-    *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
-    *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
-    *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-    return;
-}
-
-static INLINE void ST_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    vst1q_u8(d, *q8u8);
-    d += d_stride;
-    vst1q_u8(d, *q9u8);
-    d += d_stride;
-    vst1q_u8(d, *q10u8);
-    d += d_stride;
-    vst1q_u8(d, *q11u8);
-    d += d_stride;
-    vst1q_u8(d, *q12u8);
-    d += d_stride;
-    vst1q_u8(d, *q13u8);
-    d += d_stride;
-    vst1q_u8(d, *q14u8);
-    d += d_stride;
-    vst1q_u8(d, *q15u8);
-    return;
-}
-
-void vp9_idct32x32_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-    int i, j, dest_stride8;
-    uint8_t *d;
-    int16_t a1, cospi_16_64 = 11585;
-    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
-    out = dct_const_round_shift(out * cospi_16_64);
-    a1 = ROUND_POWER_OF_TWO(out, 6);
-
-    dest_stride8 = dest_stride * 8;
-    if (a1 >= 0) {  // diff_positive_32_32
-        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-        q0u8 = vdupq_n_u8(a1);
-        for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
-            d = dest;
-            for (j = 0; j < 4; j++) {
-                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
-                                    &q12u8, &q13u8, &q14u8, &q15u8);
-                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                d += dest_stride8;
-            }
-        }
-    } else {  // diff_negative_32_32
-        a1 = -a1;
-        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-        q0u8 = vdupq_n_u8(a1);
-        for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
-            d = dest;
-            for (j = 0; j < 4; j++) {
-                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
-                                    &q12u8, &q13u8, &q14u8, &q15u8);
-                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
-                                        &q12u8, &q13u8, &q14u8, &q15u8);
-                d += dest_stride8;
-            }
-        }
-    }
-    return;
-}
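
The function above reduces the whole 32x32 inverse transform to a single value:
with only the DC coefficient present, a1 is derived from input[0] and then
added to (or, when negative, subtracted from) every destination pixel with
saturation. A scalar sketch of the same computation, assuming the usual
cospi_16_64 = 11585 and a 14-bit rounding shift:

#include <stdint.h>

static int32_t dct_const_round_shift_sketch(int32_t v) {
  return (v + (1 << 13)) >> 14;  /* DCT_CONST_BITS == 14 */
}

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* DC-only 32x32 add: only input[0] is used. */
static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  const int16_t cospi_16_64 = 11585;
  int32_t out = dct_const_round_shift_sketch(input[0] * cospi_16_64);
  int a1, r, c;

  out = dct_const_round_shift_sketch(out * cospi_16_64);
  a1 = (out + 32) >> 6;  /* ROUND_POWER_OF_TWO(out, 6) */

  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      dest[r * dest_stride + c] = clip_pixel(dest[r * dest_stride + c] + a1);
}

The NEON version splits this into a positive branch (vqaddq_u8) and a negative
branch (vqsubq_u8) so the per-pixel work stays in saturating unsigned 8-bit
arithmetic, 16 pixels at a time.
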
--- a/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
+++ /dev/null
@@ -1,144 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-    EXPORT  |vp9_idct32x32_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ;TODO(hkuang): put the following macros in a separate
-    ;file so other idct functions can also use them.
-    MACRO
-    LD_16x8          $src, $stride
-    vld1.8           {q8}, [$src], $stride
-    vld1.8           {q9}, [$src], $stride
-    vld1.8           {q10}, [$src], $stride
-    vld1.8           {q11}, [$src], $stride
-    vld1.8           {q12}, [$src], $stride
-    vld1.8           {q13}, [$src], $stride
-    vld1.8           {q14}, [$src], $stride
-    vld1.8           {q15}, [$src], $stride
-    MEND
-
-    MACRO
-    ADD_DIFF_16x8    $diff
-    vqadd.u8         q8, q8, $diff
-    vqadd.u8         q9, q9, $diff
-    vqadd.u8         q10, q10, $diff
-    vqadd.u8         q11, q11, $diff
-    vqadd.u8         q12, q12, $diff
-    vqadd.u8         q13, q13, $diff
-    vqadd.u8         q14, q14, $diff
-    vqadd.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    SUB_DIFF_16x8    $diff
-    vqsub.u8         q8, q8, $diff
-    vqsub.u8         q9, q9, $diff
-    vqsub.u8         q10, q10, $diff
-    vqsub.u8         q11, q11, $diff
-    vqsub.u8         q12, q12, $diff
-    vqsub.u8         q13, q13, $diff
-    vqsub.u8         q14, q14, $diff
-    vqsub.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    ST_16x8          $dst, $stride
-    vst1.8           {q8}, [$dst], $stride
-    vst1.8           {q9}, [$dst], $stride
-    vst1.8           {q10},[$dst], $stride
-    vst1.8           {q11},[$dst], $stride
-    vst1.8           {q12},[$dst], $stride
-    vst1.8           {q13},[$dst], $stride
-    vst1.8           {q14},[$dst], $stride
-    vst1.8           {q15},[$dst], $stride
-    MEND
-
-;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
-;                              int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct32x32_1_add_neon| PROC
-    push             {lr}
-    pld              [r1]
-    add              r3, r1, #16               ; r3 dest + 16 for second loop
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asrs             r0, r0, #6                ; >> 6
-    bge              diff_positive_32_32
-
-diff_negative_32_32
-    neg              r0, r0
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_negative_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_negative_32_32_loop
-    pop              {pc}
-
-diff_positive_32_32
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_positive_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_positive_32_32_loop
-    pop              {pc}
-
-    ENDP             ; |vp9_idct32x32_1_add_neon|
-    END
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
+++ /dev/null
@@ -1,719 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-#define LOAD_FROM_TRANSPOSED(prev, first, second) \
-    q14s16 = vld1q_s16(trans_buf + first * 8); \
-    q13s16 = vld1q_s16(trans_buf + second * 8);
-
-#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
-    qA = vld1q_s16(out + first * 32); \
-    qB = vld1q_s16(out + second * 32);
-
-#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
-    vst1q_s16(out + first * 32, qA); \
-    vst1q_s16(out + second * 32, qB);
-
-#define  STORE_COMBINE_CENTER_RESULTS(r10, r9) \
-       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
-                                      q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(
-        uint8_t *p1,
-        uint8_t *p2,
-        int stride,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16) {
-    int16x4_t d8s16, d9s16, d10s16, d11s16;
-
-    d8s16 = vld1_s16((int16_t *)p1);
-    p1 += stride;
-    d11s16 = vld1_s16((int16_t *)p2);
-    p2 -= stride;
-    d9s16 = vld1_s16((int16_t *)p1);
-    d10s16 = vld1_s16((int16_t *)p2);
-
-    q7s16 = vrshrq_n_s16(q7s16, 6);
-    q8s16 = vrshrq_n_s16(q8s16, 6);
-    q9s16 = vrshrq_n_s16(q9s16, 6);
-    q6s16 = vrshrq_n_s16(q6s16, 6);
-
-    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
-                                           vreinterpret_u8_s16(d9s16)));
-    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                                           vreinterpret_u8_s16(d10s16)));
-    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                                           vreinterpret_u8_s16(d11s16)));
-    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
-                                           vreinterpret_u8_s16(d8s16)));
-
-    d9s16  = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-    d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
-    d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
-    d8s16  = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-
-    vst1_s16((int16_t *)p1, d9s16);
-    p1 -= stride;
-    vst1_s16((int16_t *)p2, d10s16);
-    p2 += stride;
-    vst1_s16((int16_t *)p1, d8s16);
-    vst1_s16((int16_t *)p2, d11s16);
-    return;
-}
-
-#define  STORE_COMBINE_EXTREME_RESULTS(r7, r6); \
-       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
-                                      q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
-        uint8_t *p1,
-        uint8_t *p2,
-        int stride,
-        int16x8_t q4s16,
-        int16x8_t q5s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16) {
-    int16x4_t d4s16, d5s16, d6s16, d7s16;
-
-    d4s16 = vld1_s16((int16_t *)p1);
-    p1 += stride;
-    d7s16 = vld1_s16((int16_t *)p2);
-    p2 -= stride;
-    d5s16 = vld1_s16((int16_t *)p1);
-    d6s16 = vld1_s16((int16_t *)p2);
-
-    q5s16 = vrshrq_n_s16(q5s16, 6);
-    q6s16 = vrshrq_n_s16(q6s16, 6);
-    q7s16 = vrshrq_n_s16(q7s16, 6);
-    q4s16 = vrshrq_n_s16(q4s16, 6);
-
-    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
-                                           vreinterpret_u8_s16(d5s16)));
-    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
-                                           vreinterpret_u8_s16(d6s16)));
-    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
-                                           vreinterpret_u8_s16(d7s16)));
-    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
-                                           vreinterpret_u8_s16(d4s16)));
-
-    d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
-    d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-    d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-    d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
-
-    vst1_s16((int16_t *)p1, d5s16);
-    p1 -= stride;
-    vst1_s16((int16_t *)p2, d6s16);
-    p2 += stride;
-    vst1_s16((int16_t *)p2, d7s16);
-    vst1_s16((int16_t *)p1, d4s16);
-    return;
-}
-
-#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
-        DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(
-        int16x8_t q14s16,
-        int16x8_t q13s16,
-        int16_t first_const,
-        int16_t second_const,
-        int16x8_t *qAs16,
-        int16x8_t *qBs16) {
-    int16x4_t d30s16, d31s16;
-    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
-    int16x4_t dCs16, dDs16, dAs16, dBs16;
-
-    dCs16 = vget_low_s16(q14s16);
-    dDs16 = vget_high_s16(q14s16);
-    dAs16 = vget_low_s16(q13s16);
-    dBs16 = vget_high_s16(q13s16);
-
-    d30s16 = vdup_n_s16(first_const);
-    d31s16 = vdup_n_s16(second_const);
-
-    q8s32 = vmull_s16(dCs16, d30s16);
-    q10s32 = vmull_s16(dAs16, d31s16);
-    q9s32 = vmull_s16(dDs16, d30s16);
-    q11s32 = vmull_s16(dBs16, d31s16);
-    q12s32 = vmull_s16(dCs16, d31s16);
-
-    q8s32 = vsubq_s32(q8s32, q10s32);
-    q9s32 = vsubq_s32(q9s32, q11s32);
-
-    q10s32 = vmull_s16(dDs16, d31s16);
-    q11s32 = vmull_s16(dAs16, d30s16);
-    q15s32 = vmull_s16(dBs16, d30s16);
-
-    q11s32 = vaddq_s32(q12s32, q11s32);
-    q10s32 = vaddq_s32(q10s32, q15s32);
-
-    *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
-                          vqrshrn_n_s32(q9s32, 14));
-    *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
-                          vqrshrn_n_s32(q10s32, 14));
-    return;
-}
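
/* Per lane, DO_BUTTERFLY() above computes the rotation used throughout this
 * file (and spelled out in the comments of the 32x32 .asm version below):
 *   *qAs16 = dct_const_round_shift(x * first_const  - y * second_const)
 *   *qBs16 = dct_const_round_shift(x * second_const + y * first_const)
 * with x taken from q14s16 and y from q13s16. A scalar sketch, assuming the
 * usual 14-bit rounding shift (vqrshrn_n_s32 additionally saturates the
 * result to int16): */

#include <stdint.h>

static void butterfly_sketch(int16_t x, int16_t y,
                             int16_t first_const, int16_t second_const,
                             int16_t *out_a, int16_t *out_b) {
  const int32_t temp1 = x * first_const - y * second_const;
  const int32_t temp2 = x * second_const + y * first_const;
  *out_a = (int16_t)((temp1 + (1 << 13)) >> 14);  /* dct_const_round_shift */
  *out_b = (int16_t)((temp2 + (1 << 13)) >> 14);
}
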
-
-static INLINE void idct32_transpose_pair(
-        int16_t *input,
-        int16_t *t_buf) {
-    int16_t *in;
-    int i;
-    const int stride = 32;
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-    for (i = 0; i < 4; i++, input += 8) {
-        in = input;
-        q8s16 = vld1q_s16(in);
-        in += stride;
-        q9s16 = vld1q_s16(in);
-        in += stride;
-        q10s16 = vld1q_s16(in);
-        in += stride;
-        q11s16 = vld1q_s16(in);
-        in += stride;
-        q12s16 = vld1q_s16(in);
-        in += stride;
-        q13s16 = vld1q_s16(in);
-        in += stride;
-        q14s16 = vld1q_s16(in);
-        in += stride;
-        q15s16 = vld1q_s16(in);
-
-        d16s16 = vget_low_s16(q8s16);
-        d17s16 = vget_high_s16(q8s16);
-        d18s16 = vget_low_s16(q9s16);
-        d19s16 = vget_high_s16(q9s16);
-        d20s16 = vget_low_s16(q10s16);
-        d21s16 = vget_high_s16(q10s16);
-        d22s16 = vget_low_s16(q11s16);
-        d23s16 = vget_high_s16(q11s16);
-        d24s16 = vget_low_s16(q12s16);
-        d25s16 = vget_high_s16(q12s16);
-        d26s16 = vget_low_s16(q13s16);
-        d27s16 = vget_high_s16(q13s16);
-        d28s16 = vget_low_s16(q14s16);
-        d29s16 = vget_high_s16(q14s16);
-        d30s16 = vget_low_s16(q15s16);
-        d31s16 = vget_high_s16(q15s16);
-
-        q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
-        q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
-        q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-        q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-        q12s16 = vcombine_s16(d17s16, d25s16);
-        q13s16 = vcombine_s16(d19s16, d27s16);
-        q14s16 = vcombine_s16(d21s16, d29s16);
-        q15s16 = vcombine_s16(d23s16, d31s16);
-
-        q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
-                            vreinterpretq_s32_s16(q10s16));
-        q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
-                            vreinterpretq_s32_s16(q11s16));
-        q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
-                            vreinterpretq_s32_s16(q14s16));
-        q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
-                            vreinterpretq_s32_s16(q15s16));
-
-        q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                            vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-        q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                            vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-        q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                            vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-        q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                            vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-        vst1q_s16(t_buf, q0x2s16.val[0]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q0x2s16.val[1]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q1x2s16.val[0]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q1x2s16.val[1]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q2x2s16.val[0]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q2x2s16.val[1]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q3x2s16.val[0]);
-        t_buf += 8;
-        vst1q_s16(t_buf, q3x2s16.val[1]);
-        t_buf += 8;
-    }
-    return;
-}
-
-static INLINE void idct32_bands_end_1st_pass(
-        int16_t *out,
-        int16x8_t q2s16,
-        int16x8_t q3s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16,
-        int16x8_t q10s16,
-        int16x8_t q11s16,
-        int16x8_t q12s16,
-        int16x8_t q13s16,
-        int16x8_t q14s16,
-        int16x8_t q15s16) {
-    int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-    STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
-    STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
-
-    LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
-    STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
-
-    LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
-    q2s16 = vaddq_s16(q10s16, q1s16);
-    q3s16 = vaddq_s16(q11s16, q0s16);
-    q4s16 = vsubq_s16(q11s16, q0s16);
-    q5s16 = vsubq_s16(q10s16, q1s16);
-
-    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
-    STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
-
-    LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
-    STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
-
-    LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
-    q2s16 = vaddq_s16(q12s16, q1s16);
-    q3s16 = vaddq_s16(q13s16, q0s16);
-    q4s16 = vsubq_s16(q13s16, q0s16);
-    q5s16 = vsubq_s16(q12s16, q1s16);
-
-    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
-    STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
-
-    LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
-    STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
-
-    LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
-    q2s16 = vaddq_s16(q14s16, q1s16);
-    q3s16 = vaddq_s16(q15s16, q0s16);
-    q4s16 = vsubq_s16(q15s16, q0s16);
-    q5s16 = vsubq_s16(q14s16, q1s16);
-
-    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
-    STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
-    LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
-    STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
-    return;
-}
-
-static INLINE void idct32_bands_end_2nd_pass(
-        int16_t *out,
-        uint8_t *dest,
-        int stride,
-        int16x8_t q2s16,
-        int16x8_t q3s16,
-        int16x8_t q6s16,
-        int16x8_t q7s16,
-        int16x8_t q8s16,
-        int16x8_t q9s16,
-        int16x8_t q10s16,
-        int16x8_t q11s16,
-        int16x8_t q12s16,
-        int16x8_t q13s16,
-        int16x8_t q14s16,
-        int16x8_t q15s16) {
-    uint8_t *r6  = dest + 31 * stride;
-    uint8_t *r7  = dest/* +  0 * stride*/;
-    uint8_t *r9  = dest + 15 * stride;
-    uint8_t *r10 = dest + 16 * stride;
-    int str2 = stride << 1;
-    int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-    STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
-
-    LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
-
-    LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
-    q2s16 = vaddq_s16(q10s16, q1s16);
-    q3s16 = vaddq_s16(q11s16, q0s16);
-    q4s16 = vsubq_s16(q11s16, q0s16);
-    q5s16 = vsubq_s16(q10s16, q1s16);
-
-    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
-
-    LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
-
-    LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
-    q2s16 = vaddq_s16(q12s16, q1s16);
-    q3s16 = vaddq_s16(q13s16, q0s16);
-    q4s16 = vsubq_s16(q13s16, q0s16);
-    q5s16 = vsubq_s16(q12s16, q1s16);
-
-    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_COMBINE_CENTER_RESULTS(r10, r9);
-    r10 += str2; r9 -= str2;
-
-    LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    r7 += str2; r6 -= str2;
-
-    LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
-    q2s16 = vaddq_s16(q14s16, q1s16);
-    q3s16 = vaddq_s16(q15s16, q0s16);
-    q4s16 = vsubq_s16(q15s16, q0s16);
-    q5s16 = vsubq_s16(q14s16, q1s16);
-
-    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
-    q8s16 = vaddq_s16(q4s16, q1s16);
-    q9s16 = vaddq_s16(q5s16, q0s16);
-    q6s16 = vsubq_s16(q5s16, q0s16);
-    q7s16 = vsubq_s16(q4s16, q1s16);
-    STORE_COMBINE_CENTER_RESULTS(r10, r9);
-
-    LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
-    q4s16 = vaddq_s16(q2s16, q1s16);
-    q5s16 = vaddq_s16(q3s16, q0s16);
-    q6s16 = vsubq_s16(q3s16, q0s16);
-    q7s16 = vsubq_s16(q2s16, q1s16);
-    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-    return;
-}
-
-void vp9_idct32x32_1024_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int stride) {
-    int i, idct32_pass_loop;
-    int16_t trans_buf[32 * 8];
-    int16_t pass1[32 * 32];
-    int16_t pass2[32 * 32];
-    int16_t *out;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-
-    for (idct32_pass_loop = 0, out = pass1;
-         idct32_pass_loop < 2;
-         idct32_pass_loop++,
-         input = pass1,  // the input of pass2 is the result of pass1
-         out = pass2) {
-        for (i = 0;
-             i < 4; i++,
-             input += 32 * 8, out += 8) {  // idct32_bands_loop
-            idct32_transpose_pair(input, trans_buf);
-
-            // -----------------------------------------
-            // BLOCK A: 16-19,28-31
-            // -----------------------------------------
-            // generate 16,17,30,31
-            // part of stage 1
-            LOAD_FROM_TRANSPOSED(0, 1, 31)
-            DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
-            LOAD_FROM_TRANSPOSED(31, 17, 15)
-            DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
-            // part of stage 2
-            q4s16 = vaddq_s16(q0s16, q1s16);
-            q13s16 = vsubq_s16(q0s16, q1s16);
-            q6s16 = vaddq_s16(q2s16, q3s16);
-            q14s16 = vsubq_s16(q2s16, q3s16);
-            // part of stage 3
-            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
-
-            // generate 18,19,28,29
-            // part of stage 1
-            LOAD_FROM_TRANSPOSED(15, 9, 23)
-            DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
-            LOAD_FROM_TRANSPOSED(23, 25, 7)
-            DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
-            // part of stage 2
-            q13s16 = vsubq_s16(q3s16, q2s16);
-            q3s16 = vaddq_s16(q3s16, q2s16);
-            q14s16 = vsubq_s16(q1s16, q0s16);
-            q2s16 = vaddq_s16(q1s16, q0s16);
-            // part of stage 3
-            DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
-            // part of stage 4
-            q8s16 = vaddq_s16(q4s16, q2s16);
-            q9s16 = vaddq_s16(q5s16, q0s16);
-            q10s16 = vaddq_s16(q7s16, q1s16);
-            q15s16 = vaddq_s16(q6s16, q3s16);
-            q13s16 = vsubq_s16(q5s16, q0s16);
-            q14s16 = vsubq_s16(q7s16, q1s16);
-            STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
-            STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
-            // part of stage 5
-            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
-            STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
-            // part of stage 4
-            q13s16 = vsubq_s16(q4s16, q2s16);
-            q14s16 = vsubq_s16(q6s16, q3s16);
-            // part of stage 5
-            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
-            STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
-
-            // -----------------------------------------
-            // BLOCK B: 20-23,24-27
-            // -----------------------------------------
-            // generate 20,21,26,27
-            // part of stage 1
-            LOAD_FROM_TRANSPOSED(7, 5, 27)
-            DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
-            LOAD_FROM_TRANSPOSED(27, 21, 11)
-            DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
-            // part of stage 2
-            q13s16 = vsubq_s16(q0s16, q1s16);
-            q0s16 = vaddq_s16(q0s16, q1s16);
-            q14s16 = vsubq_s16(q2s16, q3s16);
-            q2s16 = vaddq_s16(q2s16, q3s16);
-            // part of stage 3
-            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-
-            // generate 22,23,24,25
-            // part of stage 1
-            LOAD_FROM_TRANSPOSED(11, 13, 19)
-            DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
-            LOAD_FROM_TRANSPOSED(19, 29, 3)
-            DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
-            // part of stage 2
-            q14s16 = vsubq_s16(q4s16, q5s16);
-            q5s16  = vaddq_s16(q4s16, q5s16);
-            q13s16 = vsubq_s16(q6s16, q7s16);
-            q6s16  = vaddq_s16(q6s16, q7s16);
-            // part of stage 3
-            DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
-            // part of stage 4
-            q10s16 = vaddq_s16(q7s16, q1s16);
-            q11s16 = vaddq_s16(q5s16, q0s16);
-            q12s16 = vaddq_s16(q6s16, q2s16);
-            q15s16 = vaddq_s16(q4s16, q3s16);
-            // part of stage 6
-            LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
-            q8s16 = vaddq_s16(q14s16, q11s16);
-            q9s16 = vaddq_s16(q13s16, q10s16);
-            q13s16 = vsubq_s16(q13s16, q10s16);
-            q11s16 = vsubq_s16(q14s16, q11s16);
-            STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
-            LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
-            q8s16  = vsubq_s16(q9s16, q12s16);
-            q10s16 = vaddq_s16(q14s16, q15s16);
-            q14s16 = vsubq_s16(q14s16, q15s16);
-            q12s16 = vaddq_s16(q9s16, q12s16);
-            STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
-            // part of stage 7
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-            STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
-            q13s16 = q11s16;
-            q14s16 = q8s16;
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-            STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
-            // part of stage 4
-            q14s16 = vsubq_s16(q5s16, q0s16);
-            q13s16 = vsubq_s16(q6s16, q2s16);
-            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
-            q14s16 = vsubq_s16(q7s16, q1s16);
-            q13s16 = vsubq_s16(q4s16, q3s16);
-            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
-            // part of stage 6
-            LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
-            q8s16 = vaddq_s16(q14s16, q1s16);
-            q9s16 = vaddq_s16(q13s16, q6s16);
-            q13s16 = vsubq_s16(q13s16, q6s16);
-            q1s16 = vsubq_s16(q14s16, q1s16);
-            STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
-            LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
-            q14s16 = vsubq_s16(q8s16, q5s16);
-            q10s16 = vaddq_s16(q8s16, q5s16);
-            q11s16 = vaddq_s16(q9s16, q0s16);
-            q0s16 = vsubq_s16(q9s16, q0s16);
-            STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
-            // part of stage 7
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-            STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
-            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
-                                                         &q1s16, &q0s16);
-            STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
-
-            // -----------------------------------------
-            // BLOCK C: 8-10,11-15
-            // -----------------------------------------
-            // generate 8,9,14,15
-            // part of stage 2
-            LOAD_FROM_TRANSPOSED(3, 2, 30)
-            DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
-            LOAD_FROM_TRANSPOSED(30, 18, 14)
-            DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
-            // part of stage 3
-            q13s16 = vsubq_s16(q0s16, q1s16);
-            q0s16 = vaddq_s16(q0s16, q1s16);
-            q14s16 = vsubq_s16(q2s16, q3s16);
-            q2s16 = vaddq_s16(q2s16, q3s16);
-            // part of stage 4
-            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
-
-            // generate 10,11,12,13
-            // part of stage 2
-            LOAD_FROM_TRANSPOSED(14, 10, 22)
-            DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
-            LOAD_FROM_TRANSPOSED(22, 26, 6)
-            DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
-            // part of stage 3
-            q14s16 = vsubq_s16(q4s16, q5s16);
-            q5s16 = vaddq_s16(q4s16, q5s16);
-            q13s16 = vsubq_s16(q6s16, q7s16);
-            q6s16 = vaddq_s16(q6s16, q7s16);
-            // part of stage 4
-            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
-            // part of stage 5
-            q8s16 = vaddq_s16(q0s16, q5s16);
-            q9s16 = vaddq_s16(q1s16, q7s16);
-            q13s16 = vsubq_s16(q1s16, q7s16);
-            q14s16 = vsubq_s16(q3s16, q4s16);
-            q10s16 = vaddq_s16(q3s16, q4s16);
-            q15s16 = vaddq_s16(q2s16, q6s16);
-            STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
-            STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
-            // part of stage 6
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-            STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
-            q13s16 = vsubq_s16(q0s16, q5s16);
-            q14s16 = vsubq_s16(q2s16, q6s16);
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-            STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
-
-            // -----------------------------------------
-            // BLOCK D: 0-3,4-7
-            // -----------------------------------------
-            // generate 4,5,6,7
-            // part of stage 3
-            LOAD_FROM_TRANSPOSED(6, 4, 28)
-            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
-            LOAD_FROM_TRANSPOSED(28, 20, 12)
-            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-            // part of stage 4
-            q13s16 = vsubq_s16(q0s16, q1s16);
-            q0s16 = vaddq_s16(q0s16, q1s16);
-            q14s16 = vsubq_s16(q2s16, q3s16);
-            q2s16 = vaddq_s16(q2s16, q3s16);
-            // part of stage 5
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-
-            // generate 0,1,2,3
-            // part of stage 4
-            LOAD_FROM_TRANSPOSED(12, 0, 16)
-            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
-            LOAD_FROM_TRANSPOSED(16, 8, 24)
-            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
-            // part of stage 5
-            q4s16 = vaddq_s16(q7s16, q6s16);
-            q7s16 = vsubq_s16(q7s16, q6s16);
-            q6s16 = vsubq_s16(q5s16, q14s16);
-            q5s16 = vaddq_s16(q5s16, q14s16);
-            // part of stage 6
-            q8s16 = vaddq_s16(q4s16, q2s16);
-            q9s16 = vaddq_s16(q5s16, q3s16);
-            q10s16 = vaddq_s16(q6s16, q1s16);
-            q11s16 = vaddq_s16(q7s16, q0s16);
-            q12s16 = vsubq_s16(q7s16, q0s16);
-            q13s16 = vsubq_s16(q6s16, q1s16);
-            q14s16 = vsubq_s16(q5s16, q3s16);
-            q15s16 = vsubq_s16(q4s16, q2s16);
-            // part of stage 7
-            LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
-            q2s16 = vaddq_s16(q8s16, q1s16);
-            q3s16 = vaddq_s16(q9s16, q0s16);
-            q4s16 = vsubq_s16(q9s16, q0s16);
-            q5s16 = vsubq_s16(q8s16, q1s16);
-            LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
-            q8s16 = vaddq_s16(q4s16, q1s16);
-            q9s16 = vaddq_s16(q5s16, q0s16);
-            q6s16 = vsubq_s16(q5s16, q0s16);
-            q7s16 = vsubq_s16(q4s16, q1s16);
-
-            if (idct32_pass_loop == 0) {
-                idct32_bands_end_1st_pass(out,
-                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
-            } else {
-                idct32_bands_end_2nd_pass(out, dest, stride,
-                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
-                dest += 8;
-            }
-        }
-    }
-    return;
-}
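
Each iteration of the band loop above consumes an 8-row by 32-column band of
its input (the original coefficients in pass 1, the pass-1 result in pass 2).
idct32_transpose_pair() lays that band out transposed, so the 8 values at
trans_buf + k * 8 hold coefficient k of all 8 rows and the 1-D stages can
process the 8 rows in parallel. A scalar sketch of the net effect of that
transpose step (an illustration, not the NEON code):

#include <stdint.h>

/* One 8x32 band (row stride 32) is written transposed into a contiguous
 * 32x8 buffer, matching the trans_buf + offset * 8 reads done by
 * LOAD_FROM_TRANSPOSED above. */
static void idct32_transpose_band_sketch(const int16_t *input, int16_t *t_buf) {
  int r, c;
  for (c = 0; c < 32; ++c)
    for (r = 0; r < 8; ++r)
      t_buf[c * 8 + r] = input[r * 32 + c];
}

Pass 2 then repeats the same loop with pass1 as its input, which is what the
"the input of pass2 is the result of pass1" comment in the outer loop refers to.
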
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
+++ /dev/null
@@ -1,1299 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-;TODO(cd): adjust these constants to be able to use vqdmulh for faster
-;          dct_const_round_shift(a * b) within butterfly calculations.
-cospi_1_64  EQU 16364
-cospi_2_64  EQU 16305
-cospi_3_64  EQU 16207
-cospi_4_64  EQU 16069
-cospi_5_64  EQU 15893
-cospi_6_64  EQU 15679
-cospi_7_64  EQU 15426
-cospi_8_64  EQU 15137
-cospi_9_64  EQU 14811
-cospi_10_64 EQU 14449
-cospi_11_64 EQU 14053
-cospi_12_64 EQU 13623
-cospi_13_64 EQU 13160
-cospi_14_64 EQU 12665
-cospi_15_64 EQU 12140
-cospi_16_64 EQU 11585
-cospi_17_64 EQU 11003
-cospi_18_64 EQU 10394
-cospi_19_64 EQU  9760
-cospi_20_64 EQU  9102
-cospi_21_64 EQU  8423
-cospi_22_64 EQU  7723
-cospi_23_64 EQU  7005
-cospi_24_64 EQU  6270
-cospi_25_64 EQU  5520
-cospi_26_64 EQU  4756
-cospi_27_64 EQU  3981
-cospi_28_64 EQU  3196
-cospi_29_64 EQU  2404
-cospi_30_64 EQU  1606
-cospi_31_64 EQU   804
-
-
-    EXPORT  |vp9_idct32x32_1024_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY
-
-    ; --------------------------------------------------------------------------
-    ; Load from transposed_buffer
-    ;   q13 = transposed_buffer[first_offset]
-    ;   q14 = transposed_buffer[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   transposed_buffer must be passed in. use 0 for first use.
-    MACRO
-    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
-    ; address calculation with proper stride and loading
-    add r0, #($first_offset  - $prev_offset )*8*2
-    vld1.s16        {q14}, [r0]
-    add r0, #($second_offset - $first_offset)*8*2
-    vld1.s16        {q13}, [r0]
-    ; (used) two registers (q14, q13)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Load from output (used as temporary storage)
-    ;   reg1 = output[first_offset]
-    ;   reg2 = output[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output (whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and loading
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vld1.s16        {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vld1.s16        {$reg2}, [r1]
-    ; (used) two registers ($reg1, $reg2)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Store into output (sometimes as temporary storage)
-    ;   output[first_offset] = reg1
-    ;   output[second_offset] = reg2
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output (whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and storing
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vst1.16 {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vst1.16 {$reg2}, [r1]
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]
-    vst1.16         {d11}, [r9]
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]!
-    vst1.16         {d11}, [r9]!
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]
-    vst1.16         {d4}, [r7]
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]!
-    vst1.16         {d4}, [r7]!
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    MACRO
-    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    ; TODO(cd): have special case to re-use constants when they are similar for
-    ;           consecutive butterflies
-    ; TODO(cd): have special case when both constants are the same, do the
-    ;           additions/subtractions before the multiplies.
-    ; generate the constants
-    ;   generate scalar constants
-    mov             r8,  #$first_constant  & 0xFF00
-    mov             r12, #$second_constant & 0xFF00
-    add             r8,  #$first_constant  & 0x00FF
-    add             r12, #$second_constant & 0x00FF
-    ;   generate vector constants
-    vdup.16         d30, r8
-    vdup.16         d31, r12
-    ; (used) two for inputs (regA-regD), one for constants (q15)
-    ; do some multiplications (ordered for maximum latency hiding)
-    vmull.s16 q8,  $regC, d30
-    vmull.s16 q10, $regA, d31
-    vmull.s16 q9,  $regD, d30
-    vmull.s16 q11, $regB, d31
-    vmull.s16 q12, $regC, d31
-    ; (used) five for intermediate (q8-q12), one for constants (q15)
-    ; do some addition/subtractions (to get back two register)
-    vsub.s32  q8, q8, q10
-    vsub.s32  q9, q9, q11
-    ; do more multiplications (ordered for maximum latency hiding)
-    vmull.s16 q10, $regD, d31
-    vmull.s16 q11, $regA, d30
-    vmull.s16 q15, $regB, d30
-    ; (used) six for intermediate (q8-q12, q15)
-    ; do more addition/subtractions
-    vadd.s32  q11, q12, q11
-    vadd.s32  q10, q10, q15
-    ; (used) four for intermediate (q8-q11)
-    ; dct_const_round_shift
-    vqrshrn.s32 $reg1, q8,  #14
-    vqrshrn.s32 $reg2, q9,  #14
-    vqrshrn.s32 $reg3, q11, #14
-    vqrshrn.s32 $reg4, q10, #14
-    ; (used) two for results, well four d registers
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    MACRO
-    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    MEND
-    ; --------------------------------------------------------------------------
-
-;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
-;
-;   r0  int16_t *input,
-;   r1  uint8_t *dest,
-;   r2  int dest_stride)
-; loop counters
-;   r4  bands loop counter
-;   r5  pass loop counter
-;   r8  transpose loop counter
-; combine-add pointers
-;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
-;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
-;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
-;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
-
-|vp9_idct32x32_1024_add_neon| PROC
-    ; This function does the idct32x32 transform in two passes (one pass per
-    ; iteration of the idct32_pass_loop below).
-    ;
-    ; A pass is done by transposing the input and then doing a 1d transform on
-    ; the columns. In the first pass, the transposed columns are the original
-    ; rows. In the second pass, after the transposition, the columns are the
-    ; original columns.
-    ; The 1d transform is done by looping over bands of eight columns (the
-    ; idct32_bands loop). For each band, the transform input transposition
-    ; is done on demand, one band of four 8x8 matrices at a time. The four
-    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
-    push  {r4-r11}
-    vpush {d8-d15}
-    ; stack operation
-    ; internal buffer used to transpose 8 lines into before transforming them
-    ;   int16_t transpose_buffer[32 * 8];
-    ;   at sp + [4096, 4607]
-    ; results of the first pass (transpose and transform rows)
-    ;   int16_t pass1[32 * 32];
-    ;   at sp + [0, 2047]
-    ; results of the second pass (transpose and transform columns)
-    ;   int16_t pass2[32 * 32];
-    ;   at sp + [2048, 4095]
-    sub sp, sp, #512+2048+2048
-
-    ; r6  = dest + 31 * dest_stride
-    ; r7  = dest +  0 * dest_stride
-    ; r9  = dest + 15 * dest_stride
-    ; r10 = dest + 16 * dest_stride
-    rsb r6,  r2, r2, lsl #5
-    rsb r9,  r2, r2, lsl #4
-    add r10, r1, r2, lsl #4
-    mov r7, r1
-    add r6, r6, r1
-    add r9, r9, r1
-    ; r11 = -dest_stride
-    neg r11, r2
-    ; r3 = input
-    mov r3, r0
-    ; parameters for first pass
-      ; r0 = transpose_buffer[32 * 8]
-    add r0, sp, #4096
-      ; r1 = pass1[32 * 32]
-    mov r1, sp
-
-    mov r5, #0          ; initialize pass loop counter
-idct32_pass_loop
-    mov r4, #4          ; initialize bands loop counter
-idct32_bands_loop
-    mov r8, #2          ; initialize transpose loop counter
-idct32_transpose_pair_loop
-    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
-    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
-    ; adjusted to 32 because of the two post-increments.
-    vld1.s16        {q8},  [r3]!
-    vld1.s16        {q0},  [r3]!
-    add r3, #32
-    vld1.s16        {q9},  [r3]!
-    vld1.s16        {q1},  [r3]!
-    add r3, #32
-    vld1.s16        {q10}, [r3]!
-    vld1.s16        {q2},  [r3]!
-    add r3, #32
-    vld1.s16        {q11}, [r3]!
-    vld1.s16        {q3},  [r3]!
-    add r3, #32
-    vld1.s16        {q12}, [r3]!
-    vld1.s16        {q4},  [r3]!
-    add r3, #32
-    vld1.s16        {q13}, [r3]!
-    vld1.s16        {q5},  [r3]!
-    add r3, #32
-    vld1.s16        {q14}, [r3]!
-    vld1.s16        {q6},  [r3]!
-    add r3, #32
-    vld1.s16        {q15}, [r3]!
-    vld1.s16        {q7},  [r3]!
-
-    ; Transpose the two 8x8 16bit data matrices.
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vswp            d1,  d8
-    vswp            d7,  d14
-    vswp            d5,  d12
-    vswp            d3,  d10
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.32         q0,  q2
-    vtrn.32         q1,  q3
-    vtrn.32         q4,  q6
-    vtrn.32         q5,  q7
-    vtrn.16         q8,  q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    vtrn.16         q0,  q1
-    vtrn.16         q2,  q3
-    vtrn.16         q4,  q5
-    vtrn.16         q6,  q7
-
-    ; Store both matrices after each other. There is a stride of 32, which
-    ; adjusts to nothing because of the post-increments.
-    vst1.16        {q8},  [r0]!
-    vst1.16        {q9},  [r0]!
-    vst1.16        {q10}, [r0]!
-    vst1.16        {q11}, [r0]!
-    vst1.16        {q12}, [r0]!
-    vst1.16        {q13}, [r0]!
-    vst1.16        {q14}, [r0]!
-    vst1.16        {q15}, [r0]!
-    vst1.16        {q0},  [r0]!
-    vst1.16        {q1},  [r0]!
-    vst1.16        {q2},  [r0]!
-    vst1.16        {q3},  [r0]!
-    vst1.16        {q4},  [r0]!
-    vst1.16        {q5},  [r0]!
-    vst1.16        {q6},  [r0]!
-    vst1.16        {q7},  [r0]!
-
-    ; increment pointers by adjusted stride (not necessary for r0/out)
-    ;   go back by 7*32 for the seven lines fully advanced by the reads and adds
-    ;   go back by 32 for the eighth line, which was only read
-    ;   advance by 16*2 to go to the next pair
-    sub r3,  r3,  #7*32*2 + 32 - 16*2
-    ; transpose pair loop processing
-    subs r8, r8, #1
-    bne idct32_transpose_pair_loop
-
-    ; restore r0/input to its original value
-    sub r0, r0, #32*8*2
-
-    ; Instead of doing the transforms stage by stage, it is done by loading
-    ; some input values and doing as many stages as possible to minimize the
-    ; storing/loading of intermediate results. To fit within registers, the
-    ; final coefficients are cut into four blocks:
-    ; BLOCK A: 16-19,28-31
-    ; BLOCK B: 20-23,24-27
-    ; BLOCK C: 8-10,11-15
-    ; BLOCK D: 0-3,4-7
-    ; Blocks A and C are straight calculation through the various stages. In
-    ; block B, further calculations are performed using the results from
-    ; block A. In block D, further calculations are performed using the results
-    ; from block C and then the final calculations are done using results from
-    ; block A and B which have been combined at the end of block B.
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK A: 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; generate 16,17,30,31
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
-    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
-    ;step1b[16][i] = dct_const_round_shift(temp1);
-    ;step1b[31][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 0, 1, 31
-    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
-    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
-    ;step1b[17][i] = dct_const_round_shift(temp1);
-    ;step1b[30][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 31, 17, 15
-    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[16] =  step1b[16][i] + step1b[17][i];
-    ;step2[17] =  step1b[16][i] - step1b[17][i];
-    ;step2[30] = -step1b[30][i] + step1b[31][i];
-    ;step2[31] =  step1b[30][i] + step1b[31][i];
-    vadd.s16  q4, q0, q1
-    vsub.s16  q13, q0, q1
-    vadd.s16  q6, q2, q3
-    vsub.s16  q14, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
-    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
-    ;step3[17] = dct_const_round_shift(temp1);
-    ;step3[30] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; generate 18,19,28,29
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
-    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
-    ;step1b[18][i] = dct_const_round_shift(temp1);
-    ;step1b[29][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 15, 9, 23
-    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
-    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
-    ;step1b[19][i] = dct_const_round_shift(temp1);
-    ;step1b[28][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 23, 25, 7
-    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[18] = -step1b[18][i] + step1b[19][i];
-    ;step2[19] =  step1b[18][i] + step1b[19][i];
-    ;step2[28] =  step1b[28][i] + step1b[29][i];
-    ;step2[29] =  step1b[28][i] - step1b[29][i];
-    vsub.s16  q13, q3, q2
-    vadd.s16  q3,  q3, q2
-    vsub.s16  q14, q1, q0
-    vadd.s16  q2,  q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
-    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
-    ;step3[29] = dct_const_round_shift(temp1);
-    ;step3[18] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
-    ; --------------------------------------------------------------------------
-    ; combine 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[16] = step1b[16][i] + step1b[19][i];
-    ;step1[17] = step1b[17][i] + step1b[18][i];
-    ;step1[18] = step1b[17][i] - step1b[18][i];
-    ;step1[29] = step1b[30][i] - step1b[29][i];
-    ;step1[30] = step1b[30][i] + step1b[29][i];
-    ;step1[31] = step1b[31][i] + step1b[28][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q0
-    vadd.s16  q10, q7, q1
-    vadd.s16  q15, q6, q3
-    vsub.s16  q13, q5, q0
-    vsub.s16  q14, q7, q1
-    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
-    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
-    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
-    ;step2[18] = dct_const_round_shift(temp1);
-    ;step2[29] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
-    STORE_IN_OUTPUT 30, 29, 18, q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[19] = step1b[16][i] - step1b[19][i];
-    ;step1[28] = step1b[31][i] - step1b[28][i];
-    vsub.s16  q13, q4, q2
-    vsub.s16  q14, q6, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
-    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
-    ;step2[19] = dct_const_round_shift(temp1);
-    ;step2[28] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
-    STORE_IN_OUTPUT 18, 19, 28, q4, q6
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK B: 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; generate 20,21,26,27
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
-    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
-    ;step1b[20][i] = dct_const_round_shift(temp1);
-    ;step1b[27][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 7, 5, 27
-    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
-    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
-    ;step1b[21][i] = dct_const_round_shift(temp1);
-    ;step1b[26][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 27, 21, 11
-    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[20] =  step1b[20][i] + step1b[21][i];
-    ;step2[21] =  step1b[20][i] - step1b[21][i];
-    ;step2[26] = -step1b[26][i] + step1b[27][i];
-    ;step2[27] =  step1b[26][i] + step1b[27][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
-    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
-    ;step3[21] = dct_const_round_shift(temp1);
-    ;step3[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 22,23,24,25
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
-    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
-    ;step1b[22][i] = dct_const_round_shift(temp1);
-    ;step1b[25][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 11, 13, 19
-    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
-    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
-    ;step1b[23][i] = dct_const_round_shift(temp1);
-    ;step1b[24][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 19, 29, 3
-    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[22] = -step1b[22][i] + step1b[23][i];
-    ;step2[23] =  step1b[22][i] + step1b[23][i];
-    ;step2[24] =  step1b[24][i] + step1b[25][i];
-    ;step2[25] =  step1b[24][i] - step1b[25][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
-    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
-    ;step3[25] = dct_const_round_shift(temp1);
-    ;step3[22] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[22] = step1b[22][i] + step1b[21][i];
-    ;step1[23] = step1b[23][i] + step1b[20][i];
-    vadd.s16  q10, q7, q1
-    vadd.s16  q11, q5, q0
-    ;step1[24] = step1b[24][i] + step1b[27][i];
-    ;step1[25] = step1b[25][i] + step1b[26][i];
-    vadd.s16  q12, q6, q2
-    vadd.s16  q15, q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[16] = step1b[16][i] + step1b[23][i];
-    ;step3[17] = step1b[17][i] + step1b[22][i];
-    ;step3[22] = step1b[17][i] - step1b[22][i];
-    ;step3[23] = step1b[16][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
-    vadd.s16  q8,  q14, q11
-    vadd.s16  q9,  q13, q10
-    vsub.s16  q13, q13, q10
-    vsub.s16  q11, q14, q11
-    STORE_IN_OUTPUT 17, 17, 16, q9, q8
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[24] = step1b[31][i] - step1b[24][i];
-    ;step3[25] = step1b[30][i] - step1b[25][i];
-    ;step3[30] = step1b[30][i] + step1b[25][i];
-    ;step3[31] = step1b[31][i] + step1b[24][i];
-    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
-    vsub.s16  q8,  q9,  q12
-    vadd.s16  q10, q14, q15
-    vsub.s16  q14, q14, q15
-    vadd.s16  q12, q9,  q12
-    STORE_IN_OUTPUT 31, 30, 31, q10, q12
-    ; --------------------------------------------------------------------------
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpush {q8}  ; [24]
-    vpush {q11} ; [23]
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
-    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
-    ;step1[22] = dct_const_round_shift(temp1);
-    ;step1[25] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 31, 25, 22, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
-    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
-    ;step1[23] = dct_const_round_shift(temp1);
-    ;step1[24] = dct_const_round_shift(temp2);
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpop  {q13} ; [23]
-    vpop  {q14} ; [24]
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 22, 24, 23, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[20] = step1b[23][i] - step1b[20][i];
-    ;step1[27] = step1b[24][i] - step1b[27][i];
-    vsub.s16  q14, q5, q0
-    vsub.s16  q13, q6, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
-    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
-    ;step2[27] = dct_const_round_shift(temp1);
-    ;step2[20] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[21] = step1b[22][i] - step1b[21][i];
-    ;step1[26] = step1b[25][i] - step1b[26][i];
-    vsub.s16  q14,  q7, q1
-    vsub.s16  q13,  q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
-    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
-    ;step2[26] = dct_const_round_shift(temp1);
-    ;step2[21] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[18] = step1b[18][i] + step1b[21][i];
-    ;step3[19] = step1b[19][i] + step1b[20][i];
-    ;step3[20] = step1b[19][i] - step1b[20][i];
-    ;step3[21] = step1b[18][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
-    vadd.s16  q8,  q14, q1
-    vadd.s16  q9,  q13, q6
-    vsub.s16  q13, q13, q6
-    vsub.s16  q1,  q14, q1
-    STORE_IN_OUTPUT 19, 18, 19, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[27] = step1b[28][i] - step1b[27][i];
-    ;step3[28] = step1b[28][i] + step1b[27][i];
-    ;step3[29] = step1b[29][i] + step1b[26][i];
-    ;step3[26] = step1b[29][i] - step1b[26][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
-    vsub.s16  q14, q8, q5
-    vadd.s16  q10, q8, q5
-    vadd.s16  q11, q9, q0
-    vsub.s16  q0, q9, q0
-    STORE_IN_OUTPUT 29, 28, 29, q10, q11
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
-    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
-    ;step1[20] = dct_const_round_shift(temp1);
-    ;step1[27] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 29, 20, 27, q13, q14
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
-    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
-    ;step1[21] = dct_const_round_shift(temp1);
-    ;step1[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
-    STORE_IN_OUTPUT 27, 21, 26, q1, q0
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK C: 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; generate 8,9,14,15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
-    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
-    ;step2[8] = dct_const_round_shift(temp1);
-    ;step2[15] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 3, 2, 30
-    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
-    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
-    ;step2[9] = dct_const_round_shift(temp1);
-    ;step2[14] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 30, 18, 14
-    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[8] = step1b[8][i] + step1b[9][i];
-    ;step3[9] = step1b[8][i] - step1b[9][i];
-    ;step3[14] = step1b[15][i] - step1b[14][i];
-    ;step3[15] = step1b[15][i] + step1b[14][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
-    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
-    ;step1[9]  = dct_const_round_shift(temp1);
-    ;step1[14] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 10,11,12,13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
-    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
-    ;step2[10] = dct_const_round_shift(temp1);
-    ;step2[13] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 14, 10, 22
-    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
-    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
-    ;step2[11] = dct_const_round_shift(temp1);
-    ;step2[12] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 22, 26, 6
-    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[10] = step1b[11][i] - step1b[10][i];
-    ;step3[11] = step1b[11][i] + step1b[10][i];
-    ;step3[12] = step1b[12][i] + step1b[13][i];
-    ;step3[13] = step1b[12][i] - step1b[13][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
-    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
-    ;step1[13] = dct_const_round_shift(temp1);
-    ;step1[10] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[8]  = step1b[8][i] + step1b[11][i];
-    ;step2[9]  = step1b[9][i] + step1b[10][i];
-    ;step2[10] = step1b[9][i] - step1b[10][i];
-    vadd.s16  q8,  q0, q5
-    vadd.s16  q9,  q1, q7
-    vsub.s16  q13, q1, q7
-    ;step2[13] = step1b[14][i] - step1b[13][i];
-    ;step2[14] = step1b[14][i] + step1b[13][i];
-    ;step2[15] = step1b[15][i] + step1b[12][i];
-    vsub.s16  q14, q3, q4
-    vadd.s16  q10, q3, q4
-    vadd.s16  q15, q2, q6
-    STORE_IN_OUTPUT 26, 8, 15, q8, q15
-    STORE_IN_OUTPUT 15, 9, 14, q9, q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
-    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
-    ;step3[10] = dct_const_round_shift(temp1);
-    ;step3[13] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 14, 13, 10, q3, q1
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[11] = step1b[8][i] - step1b[11][i];
-    ;step2[12] = step1b[15][i] - step1b[12][i];
-    vsub.s16  q13, q0, q5
-    vsub.s16  q14,  q2, q6
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
-    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
-    ;step3[11] = dct_const_round_shift(temp1);
-    ;step3[12] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 10, 11, 12, q1, q3
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK D: 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; generate 4,5,6,7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
-    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
-    ;step3[4] = dct_const_round_shift(temp1);
-    ;step3[7] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 6, 4, 28
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
-    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
-    ;step3[5] = dct_const_round_shift(temp1);
-    ;step3[6] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 28, 20, 12
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[4] = step1b[4][i] + step1b[5][i];
-    ;step1[5] = step1b[4][i] - step1b[5][i];
-    ;step1[6] = step1b[7][i] - step1b[6][i];
-    ;step1[7] = step1b[7][i] + step1b[6][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
-    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
-    ;step2[5] = dct_const_round_shift(temp1);
-    ;step2[6] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 0,1,2,3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
-    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
-    ;step1[1] = dct_const_round_shift(temp1);
-    ;step1[0] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 12, 0, 16
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
-    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
-    ;step1[2] = dct_const_round_shift(temp1);
-    ;step1[3] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 16, 8, 24
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[0] = step1b[0][i] + step1b[3][i];
-    ;step2[1] = step1b[1][i] + step1b[2][i];
-    ;step2[2] = step1b[1][i] - step1b[2][i];
-    ;step2[3] = step1b[0][i] - step1b[3][i];
-    vadd.s16  q4, q7, q6
-    vsub.s16  q7, q7, q6
-    vsub.s16  q6, q5, q14
-    vadd.s16  q5, q5, q14
-    ; --------------------------------------------------------------------------
-    ; combine 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[0] = step1b[0][i] + step1b[7][i];
-    ;step3[1] = step1b[1][i] + step1b[6][i];
-    ;step3[2] = step1b[2][i] + step1b[5][i];
-    ;step3[3] = step1b[3][i] + step1b[4][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q3
-    vadd.s16  q10, q6, q1
-    vadd.s16  q11, q7, q0
-    ;step3[4] = step1b[3][i] - step1b[4][i];
-    ;step3[5] = step1b[2][i] - step1b[5][i];
-    ;step3[6] = step1b[1][i] - step1b[6][i];
-    ;step3[7] = step1b[0][i] - step1b[7][i];
-    vsub.s16  q12, q7, q0
-    vsub.s16  q13, q6, q1
-    vsub.s16  q14, q5, q3
-    vsub.s16  q15, q4, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[0] = step1b[0][i] + step1b[15][i];
-    ;step1[1] = step1b[1][i] + step1b[14][i];
-    ;step1[14] = step1b[1][i] - step1b[14][i];
-    ;step1[15] = step1b[0][i] - step1b[15][i];
-    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
-    vadd.s16  q2, q8, q1
-    vadd.s16  q3, q9, q0
-    vsub.s16  q4, q9, q0
-    vsub.s16  q5, q8, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
-    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
-    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
-    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
-    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-
-    cmp r5, #0
-    bgt idct32_bands_end_2nd_pass
-
-idct32_bands_end_1st_pass
-    STORE_IN_OUTPUT 17, 16, 17, q6, q7
-    STORE_IN_OUTPUT 17, 14, 15, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 31, 30, 31, q6, q7
-    STORE_IN_OUTPUT 31,  0,  1, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 19, 18, 19, q6, q7
-    STORE_IN_OUTPUT 19, 12, 13, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 29, 28, 29, q6, q7
-    STORE_IN_OUTPUT 29,  2,  3, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 21, 20, 21, q6, q7
-    STORE_IN_OUTPUT 21, 10, 11, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 27, 26, 27, q6, q7
-    STORE_IN_OUTPUT 27,  4,  5, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 23, 22, 23, q6, q7
-    STORE_IN_OUTPUT 23, 8, 9, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 25, 24, 25, q6, q7
-    STORE_IN_OUTPUT 25,  6,  7, q4, q5
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (STORE_IN_OUTPUT 24,  6,  7) => 7*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #7*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; parameters for second pass
-    ; the input of pass2 is the result of pass1. we have to remove the offset
-    ;   of 32 columns induced by the above idct32_bands_loop
-    sub r3, r1, #32*2
-      ; r1 = pass2[32 * 32]
-    add r1, sp, #2048
-
-    ; pass loop processing
-    add r5, r5, #1
-    b idct32_pass_loop
-
-idct32_bands_end_2nd_pass
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; restore pointers to their initial indices for next band pass by
-    ;     removing/adding dest_stride * 8. The actual increment by eight
-    ;     is taken care of within the _LAST macros.
-    add r6,  r6,  r2, lsl #3
-    add r9,  r9,  r2, lsl #3
-    sub r7,  r7,  r2, lsl #3
-    sub r10, r10, r2, lsl #3
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #25*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; stack operation
-    add sp, sp, #512+2048+2048
-    vpop {d8-d15}
-    pop  {r4-r11}
-    bx              lr
-    ENDP  ; |vp9_idct32x32_1024_add_neon|
-    END
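
Every DO_BUTTERFLY_STD step in the file above follows the same pattern spelled out in its comments: one multiply against a cosine pair in each direction, then dct_const_round_shift on both results. A minimal scalar sketch of that pattern, assuming DCT_CONST_BITS == 14 (consistent with the asr #14 shifts in the assembly files below); the helper names here are illustrative, not the library's:

#include <stdint.h>

#define DCT_CONST_BITS 14

/* round to nearest, then drop the fixed-point fraction */
static int32_t dct_const_round_shift(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* out1 = a * c1 - b * c2;  out2 = a * c2 + b * c1 (both rounded) */
static void butterfly(int16_t a, int16_t b, int16_t c1, int16_t c2,
                      int16_t *out1, int16_t *out2) {
  *out1 = (int16_t)dct_const_round_shift((int32_t)a * c1 - (int32_t)b * c2);
  *out2 = (int16_t)dct_const_round_shift((int32_t)a * c2 + (int32_t)b * c1);
}

For instance, the first stage-1 step of BLOCK A corresponds to butterfly(input[1 * 32], input[31 * 32], cospi_31_64, cospi_1_64, &step1b16, &step1b31) in this notation.
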
--- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
+++ /dev/null
@@ -1,50 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vp9/common/vp9_idct.h"
-#include "vpx_ports/mem.h"
-
-void vp9_idct4x4_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x8_t d6u8;
-    uint32x2_t d2u32 = vdup_n_u32(0);
-    uint16x8_t q8u16;
-    int16x8_t q0s16;
-    uint8_t *d1, *d2;
-    int16_t i, a1, cospi_16_64 = 11585;
-    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-    out = dct_const_round_shift(out * cospi_16_64);
-    a1 = ROUND_POWER_OF_TWO(out, 4);
-
-    q0s16 = vdupq_n_s16(a1);
-
-    // dc_only_idct_add
-    d1 = d2 = dest;
-    for (i = 0; i < 2; i++) {
-        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
-        d1 += dest_stride;
-        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
-        d1 += dest_stride;
-
-        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
-                         vreinterpret_u8_u32(d2u32));
-        d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-
-        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
-        d2 += dest_stride;
-        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
-        d2 += dest_stride;
-    }
-    return;
-}
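
The DC-only path that the intrinsics above vectorize can be written as a few lines of scalar C. This is a hypothetical reference sketch, not the library's code: the names are illustrative, and it assumes cospi_16_64 == 11585 and DCT_CONST_BITS == 14 as used in the file above. The 8x8 DC-only kernels later in this patch differ only in the final shift (5 instead of 4) and in covering an 8x8 block.

#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void idct4x4_1_add_ref(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  const int cospi_16_64 = 11585;
  /* out = dct_const_round_shift(input[0] * cospi_16_64), applied twice */
  int out = (input[0] * cospi_16_64 + (1 << 13)) >> 14;
  out = (out * cospi_16_64 + (1 << 13)) >> 14;
  /* a1 = ROUND_POWER_OF_TWO(out, 4) */
  const int a1 = (out + (1 << 3)) >> 4;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      dest[r * dest_stride + c] = clip_pixel(dest[r * dest_stride + c] + a1);
}
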
--- a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_idct4x4_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct4x4_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    vld1.32          {d2[0]}, [r1], r2
-    vld1.32          {d2[1]}, [r1], r2
-    vld1.32          {d4[0]}, [r1], r2
-    vld1.32          {d4[1]}, [r1]
-
-    vaddw.u8         q8, q0, d2                ; dest[x] + a1
-    vaddw.u8         q9, q0, d4
-
-    vqmovun.s16      d6, q8                    ; clip_pixel
-    vqmovun.s16      d7, q9
-
-    vst1.32          {d6[0]}, [r12], r2
-    vst1.32          {d6[1]}, [r12], r2
-    vst1.32          {d7[0]}, [r12], r2
-    vst1.32          {d7[1]}, [r12]
-
-    bx               lr
-    ENDP             ; |vp9_idct4x4_1_add_neon|
-
-    END
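
The mov/add pairs in this and the following assembly files rebuild the 16-bit cosine constants from two immediates, presumably because a single ARM mov immediate cannot encode these values directly. A throwaway check, purely illustrative, that the hex splits in the comments match the decimal constants used by the C intrinsics versions:

#include <assert.h>

int main(void) {
  assert(0x2d00 + 0x41 == 11585);  /* cospi_16_64, generated above */
  assert(0x3b00 + 0x21 == 15137);  /* cospi_8_64, used in the 4x4_16 kernel below */
  assert(0x1800 + 0x7e == 6270);   /* cospi_24_64, used in the 4x4_16 kernel below */
  return 0;
}
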
--- a/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
+++ /dev/null
@@ -1,151 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-void vp9_idct4x4_16_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x8_t d26u8, d27u8;
-    uint32x2_t d26u32, d27u32;
-    uint16x8_t q8u16, q9u16;
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
-    int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
-    int16x8_t q8s16, q9s16, q13s16, q14s16;
-    int32x4_t q1s32, q13s32, q14s32, q15s32;
-    int16x4x2_t d0x2s16, d1x2s16;
-    int32x4x2_t q0x2s32;
-    uint8_t *d;
-    int16_t cospi_8_64 = 15137;
-    int16_t cospi_16_64 = 11585;
-    int16_t cospi_24_64 = 6270;
-
-    d26u32 = d27u32 = vdup_n_u32(0);
-
-    q8s16 = vld1q_s16(input);
-    q9s16 = vld1q_s16(input + 8);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-
-    d0x2s16 = vtrn_s16(d16s16, d17s16);
-    d1x2s16 = vtrn_s16(d18s16, d19s16);
-    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-    d20s16 = vdup_n_s16(cospi_8_64);
-    d21s16 = vdup_n_s16(cospi_16_64);
-
-    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
-                        vreinterpretq_s32_s16(q9s16));
-    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-    d22s16 = vdup_n_s16(cospi_24_64);
-
-    // stage 1
-    d23s16 = vadd_s16(d16s16, d18s16);
-    d24s16 = vsub_s16(d16s16, d18s16);
-
-    q15s32 = vmull_s16(d17s16, d22s16);
-    q1s32  = vmull_s16(d17s16, d20s16);
-    q13s32 = vmull_s16(d23s16, d21s16);
-    q14s32 = vmull_s16(d24s16, d21s16);
-
-    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
-
-    d26s16 = vqrshrn_n_s32(q13s32, 14);
-    d27s16 = vqrshrn_n_s32(q14s32, 14);
-    d29s16 = vqrshrn_n_s32(q15s32, 14);
-    d28s16 = vqrshrn_n_s32(q1s32,  14);
-    q13s16 = vcombine_s16(d26s16, d27s16);
-    q14s16 = vcombine_s16(d28s16, d29s16);
-
-    // stage 2
-    q8s16 = vaddq_s16(q13s16, q14s16);
-    q9s16 = vsubq_s16(q13s16, q14s16);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
-    d19s16 = vget_low_s16(q9s16);
-
-    d0x2s16 = vtrn_s16(d16s16, d17s16);
-    d1x2s16 = vtrn_s16(d18s16, d19s16);
-    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
-                        vreinterpretq_s32_s16(q9s16));
-    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-    // do the transform on columns
-    // stage 1
-    d23s16 = vadd_s16(d16s16, d18s16);
-    d24s16 = vsub_s16(d16s16, d18s16);
-
-    q15s32 = vmull_s16(d17s16, d22s16);
-    q1s32  = vmull_s16(d17s16, d20s16);
-    q13s32 = vmull_s16(d23s16, d21s16);
-    q14s32 = vmull_s16(d24s16, d21s16);
-
-    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
-
-    d26s16 = vqrshrn_n_s32(q13s32, 14);
-    d27s16 = vqrshrn_n_s32(q14s32, 14);
-    d29s16 = vqrshrn_n_s32(q15s32, 14);
-    d28s16 = vqrshrn_n_s32(q1s32,  14);
-    q13s16 = vcombine_s16(d26s16, d27s16);
-    q14s16 = vcombine_s16(d28s16, d29s16);
-
-    // stage 2
-    q8s16 = vaddq_s16(q13s16, q14s16);
-    q9s16 = vsubq_s16(q13s16, q14s16);
-
-    q8s16 = vrshrq_n_s16(q8s16, 4);
-    q9s16 = vrshrq_n_s16(q9s16, 4);
-
-    d = dest;
-    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
-    d += dest_stride;
-    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
-    d += dest_stride;
-    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
-    d += dest_stride;
-    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                     vreinterpret_u8_u32(d26u32));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                     vreinterpret_u8_u32(d27u32));
-
-    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-    d = dest;
-    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
-    d += dest_stride;
-    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
-    d += dest_stride;
-    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
-    d += dest_stride;
-    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
-    return;
-}
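
The stage 1 / stage 2 arithmetic that both the intrinsics above and the assembly below implement reduces to a single 4-point kernel applied first to rows and then to columns. A scalar sketch of that kernel, matching the commented formulas; the function name and the stride parameters are mine, not the library's:

#include <stdint.h>

#define DCT_CONST_BITS 14

static int16_t round_shift(int32_t x) {
  return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* one 1-D 4-point inverse DCT; in/out are read and written with a stride */
static void idct4_1d(const int16_t *in, int in_stride,
                     int16_t *out, int out_stride) {
  const int cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  /* stage 1 */
  const int16_t step0 = round_shift((in[0 * in_stride] + in[2 * in_stride]) * cospi_16_64);
  const int16_t step1 = round_shift((in[0 * in_stride] - in[2 * in_stride]) * cospi_16_64);
  const int16_t step2 = round_shift(in[1 * in_stride] * cospi_24_64 -
                                    in[3 * in_stride] * cospi_8_64);
  const int16_t step3 = round_shift(in[1 * in_stride] * cospi_8_64 +
                                    in[3 * in_stride] * cospi_24_64);
  /* stage 2 */
  out[0 * out_stride] = step0 + step3;
  out[1 * out_stride] = step1 + step2;
  out[2 * out_stride] = step1 - step2;
  out[3 * out_stride] = step0 - step3;
}
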
--- a/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
+++ /dev/null
@@ -1,190 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_idct4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct4x4_16_add_neon| PROC
-
-    ; The 2D transform is done with two passes which are actually pretty
-    ; similar. We first transform the rows. This is done by transposing
-    ; the inputs, doing an SIMD column transform (the columns are the
-    ; transposed rows) and then transpose the results (so that it goes back
-    ; in normal/row positions). Then, we transform the columns by doing
-    ; another SIMD column transform.
-    ; So, two passes of a transpose followed by a column transform.
-
-    ; load the inputs into q8-q9, d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-
-    ; generate scalar constants
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov             r0, #0x3b00
-    add             r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov             r3, #0x2d00
-    add             r3, #0x41
-    ; cospi_24_64 = 6270 = 0x187e
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; transpose the input data
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-
-    ; generate constant vectors
-    vdup.16         d20, r0         ; replicate cospi_8_64
-    vdup.16         d21, r3         ; replicate cospi_16_64
-
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    vdup.16         d22, r12        ; replicate cospi_24_64
-
-    ; do the transform on transposed rows
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-    vswp     d18, d19
-
-    ; transpose the results
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    ; do the transform on columns
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-
-    ; The results are in two registers, one of them being swapped. This will
-    ; be taken care of by loading the 'dest' value in a swapped fashion and
-    ; also storing them in the same swapped fashion.
-    ; temp_out[0, 1] = d16, d17 = q8
-    ; temp_out[2, 3] = d19, d18 = q9 swapped
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16 q8, q8, #4
-    vrshr.s16 q9, q9, #4
-
-    vld1.32 {d26[0]}, [r1], r2
-    vld1.32 {d26[1]}, [r1], r2
-    vld1.32 {d27[1]}, [r1], r2
-    vld1.32 {d27[0]}, [r1]  ; no post-increment
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8 q8, q8, d26
-    vaddw.u8 q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb r2, r2, #0
-    vst1.32 {d27[0]}, [r1], r2
-    vst1.32 {d27[1]}, [r1], r2
-    vst1.32 {d26[1]}, [r1], r2
-    vst1.32 {d26[0]}, [r1]  ; no post-increment
-    bx              lr
-    ENDP  ; |vp9_idct4x4_16_add_neon|
-
-    END
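
Putting the 4-point kernel sketched earlier together with the two-pass scheme described in this file's opening comment (rows first, then columns, then ROUND_POWER_OF_TWO by 4 before the clipped add to dest) gives a compact reference driver. Again a hedged sketch built on the hypothetical idct4_1d() above, reusing its includes, and not the library's implementation:

static void idct4x4_16_add_ref(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  int16_t pass1[16], pass2[16];
  /* pass 1: transform each row (the asm transposes and reuses its column code) */
  for (int r = 0; r < 4; ++r)
    idct4_1d(input + 4 * r, 1, pass1 + 4 * r, 1);
  /* pass 2: transform each column of the pass-1 result */
  for (int c = 0; c < 4; ++c)
    idct4_1d(pass1 + c, 4, pass2 + c, 4);
  /* ROUND_POWER_OF_TWO(temp_out, 4), add to dest, clip to [0, 255] */
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) {
      const int v = dest[r * dest_stride + c] + ((pass2[4 * r + c] + 8) >> 4);
      dest[r * dest_stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}
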
--- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "vp9/common/vp9_idct.h"
-#include "vpx_ports/mem.h"
-
-void vp9_idct8x8_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x8_t d2u8, d3u8, d30u8, d31u8;
-    uint64x1_t d2u64, d3u64, d4u64, d5u64;
-    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q0s16;
-    uint8_t *d1, *d2;
-    int16_t i, a1, cospi_16_64 = 11585;
-    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-    out = dct_const_round_shift(out * cospi_16_64);
-    a1 = ROUND_POWER_OF_TWO(out, 5);
-
-    q0s16 = vdupq_n_s16(a1);
-    q0u16 = vreinterpretq_u16_s16(q0s16);
-
-    d1 = d2 = dest;
-    for (i = 0; i < 2; i++) {
-        d2u64 = vld1_u64((const uint64_t *)d1);
-        d1 += dest_stride;
-        d3u64 = vld1_u64((const uint64_t *)d1);
-        d1 += dest_stride;
-        d4u64 = vld1_u64((const uint64_t *)d1);
-        d1 += dest_stride;
-        d5u64 = vld1_u64((const uint64_t *)d1);
-        d1 += dest_stride;
-
-        q9u16  = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-        q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-        q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-        q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-        d2u8  = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-        d3u8  = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-        d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-        d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-        d2 += dest_stride;
-        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-        d2 += dest_stride;
-        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-        d2 += dest_stride;
-        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
-        d2 += dest_stride;
-    }
-    return;
-}
--- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
+++ /dev/null
@@ -1,88 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_idct8x8_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct8x8_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 5)
-    add              r0, r0, #16               ; + (1 <<((5) - 1))
-    asr              r0, r0, #5                ; >> 5
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    ; load destination data
-    vld1.64          {d2}, [r1], r2
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r2
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r2
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r2
-    vld1.64          {d17}, [r1]
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vp9_idct8x8_1_add_neon|
-
-    END
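
For readers skimming this removal: the DC-only kernel above computes one reconstruction value a1 and adds it to all 64 output pixels. A minimal scalar sketch of that arithmetic, assuming DCT_CONST_BITS == 14, cospi_16_64 == 11585 (the constant the assembly builds from 0x2d00 + 0x41) and an arithmetic right shift; the helper names are illustrative, not library API:

    #include <stdint.h>

    /* (x + (1 << 13)) >> 14, the "add #0x2000 / asr #14" pair in the assembly. */
    static int32_t round_shift_14(int32_t x) {
      return (x + (1 << 13)) >> 14;
    }

    /* a1 as computed above: scale the DC coefficient by cospi_16_64 twice,
     * then apply ROUND_POWER_OF_TWO(out, 5). */
    static int16_t dc_only_residual_8x8(int16_t dc) {
      int32_t out = round_shift_14(dc * 11585);   /* cospi_16_64 */
      out = round_shift_14(out * 11585);
      return (int16_t)((out + 16) >> 5);          /* add #16, asr #5 */
    }

The kernel then broadcasts a1 (vdup.s16) and adds it to every destination byte with saturation (vaddw.u8 followed by vqmovun.s16), which is what the C reference vp9_idct8x8_1_add_c, removed from vp9_idct.c further down, does with clip_pixel_add.
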
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
+++ /dev/null
@@ -1,540 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(
-        int16x8_t *q8s16,
-        int16x8_t *q9s16,
-        int16x8_t *q10s16,
-        int16x8_t *q11s16,
-        int16x8_t *q12s16,
-        int16x8_t *q13s16,
-        int16x8_t *q14s16,
-        int16x8_t *q15s16) {
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-    d16s16 = vget_low_s16(*q8s16);
-    d17s16 = vget_high_s16(*q8s16);
-    d18s16 = vget_low_s16(*q9s16);
-    d19s16 = vget_high_s16(*q9s16);
-    d20s16 = vget_low_s16(*q10s16);
-    d21s16 = vget_high_s16(*q10s16);
-    d22s16 = vget_low_s16(*q11s16);
-    d23s16 = vget_high_s16(*q11s16);
-    d24s16 = vget_low_s16(*q12s16);
-    d25s16 = vget_high_s16(*q12s16);
-    d26s16 = vget_low_s16(*q13s16);
-    d27s16 = vget_high_s16(*q13s16);
-    d28s16 = vget_low_s16(*q14s16);
-    d29s16 = vget_high_s16(*q14s16);
-    d30s16 = vget_low_s16(*q15s16);
-    d31s16 = vget_high_s16(*q15s16);
-
-    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
-    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
-    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-    *q12s16 = vcombine_s16(d17s16, d25s16);
-    *q13s16 = vcombine_s16(d19s16, d27s16);
-    *q14s16 = vcombine_s16(d21s16, d29s16);
-    *q15s16 = vcombine_s16(d23s16, d31s16);
-
-    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
-                        vreinterpretq_s32_s16(*q10s16));
-    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
-                        vreinterpretq_s32_s16(*q11s16));
-    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
-                        vreinterpretq_s32_s16(*q14s16));
-    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
-                        vreinterpretq_s32_s16(*q15s16));
-
-    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-    *q8s16  = q0x2s16.val[0];
-    *q9s16  = q0x2s16.val[1];
-    *q10s16 = q1x2s16.val[0];
-    *q11s16 = q1x2s16.val[1];
-    *q12s16 = q2x2s16.val[0];
-    *q13s16 = q2x2s16.val[1];
-    *q14s16 = q3x2s16.val[0];
-    *q15s16 = q3x2s16.val[1];
-    return;
-}
-
-static INLINE void IDCT8x8_1D(
-        int16x8_t *q8s16,
-        int16x8_t *q9s16,
-        int16x8_t *q10s16,
-        int16x8_t *q11s16,
-        int16x8_t *q12s16,
-        int16x8_t *q13s16,
-        int16x8_t *q14s16,
-        int16x8_t *q15s16) {
-    int16x4_t d0s16, d1s16, d2s16, d3s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-    d0s16 = vdup_n_s16(cospi_28_64);
-    d1s16 = vdup_n_s16(cospi_4_64);
-    d2s16 = vdup_n_s16(cospi_12_64);
-    d3s16 = vdup_n_s16(cospi_20_64);
-
-    d16s16 = vget_low_s16(*q8s16);
-    d17s16 = vget_high_s16(*q8s16);
-    d18s16 = vget_low_s16(*q9s16);
-    d19s16 = vget_high_s16(*q9s16);
-    d20s16 = vget_low_s16(*q10s16);
-    d21s16 = vget_high_s16(*q10s16);
-    d22s16 = vget_low_s16(*q11s16);
-    d23s16 = vget_high_s16(*q11s16);
-    d24s16 = vget_low_s16(*q12s16);
-    d25s16 = vget_high_s16(*q12s16);
-    d26s16 = vget_low_s16(*q13s16);
-    d27s16 = vget_high_s16(*q13s16);
-    d28s16 = vget_low_s16(*q14s16);
-    d29s16 = vget_high_s16(*q14s16);
-    d30s16 = vget_low_s16(*q15s16);
-    d31s16 = vget_high_s16(*q15s16);
-
-    q2s32 = vmull_s16(d18s16, d0s16);
-    q3s32 = vmull_s16(d19s16, d0s16);
-    q5s32 = vmull_s16(d26s16, d2s16);
-    q6s32 = vmull_s16(d27s16, d2s16);
-
-    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-    d8s16 = vqrshrn_n_s32(q2s32, 14);
-    d9s16 = vqrshrn_n_s32(q3s32, 14);
-    d10s16 = vqrshrn_n_s32(q5s32, 14);
-    d11s16 = vqrshrn_n_s32(q6s32, 14);
-    q4s16 = vcombine_s16(d8s16, d9s16);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-
-    q2s32 = vmull_s16(d18s16, d1s16);
-    q3s32 = vmull_s16(d19s16, d1s16);
-    q9s32 = vmull_s16(d26s16, d3s16);
-    q13s32 = vmull_s16(d27s16, d3s16);
-
-    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-    d14s16 = vqrshrn_n_s32(q2s32, 14);
-    d15s16 = vqrshrn_n_s32(q3s32, 14);
-    d12s16 = vqrshrn_n_s32(q9s32, 14);
-    d13s16 = vqrshrn_n_s32(q13s32, 14);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-    q7s16 = vcombine_s16(d14s16, d15s16);
-
-    d0s16 = vdup_n_s16(cospi_16_64);
-
-    q2s32 = vmull_s16(d16s16, d0s16);
-    q3s32 = vmull_s16(d17s16, d0s16);
-    q13s32 = vmull_s16(d16s16, d0s16);
-    q15s32 = vmull_s16(d17s16, d0s16);
-
-    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-    d0s16 = vdup_n_s16(cospi_24_64);
-    d1s16 = vdup_n_s16(cospi_8_64);
-
-    d18s16 = vqrshrn_n_s32(q2s32, 14);
-    d19s16 = vqrshrn_n_s32(q3s32, 14);
-    d22s16 = vqrshrn_n_s32(q13s32, 14);
-    d23s16 = vqrshrn_n_s32(q15s32, 14);
-    *q9s16 = vcombine_s16(d18s16, d19s16);
-    *q11s16 = vcombine_s16(d22s16, d23s16);
-
-    q2s32 = vmull_s16(d20s16, d0s16);
-    q3s32 = vmull_s16(d21s16, d0s16);
-    q8s32 = vmull_s16(d20s16, d1s16);
-    q12s32 = vmull_s16(d21s16, d1s16);
-
-    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-    d26s16 = vqrshrn_n_s32(q2s32, 14);
-    d27s16 = vqrshrn_n_s32(q3s32, 14);
-    d30s16 = vqrshrn_n_s32(q8s32, 14);
-    d31s16 = vqrshrn_n_s32(q12s32, 14);
-    *q13s16 = vcombine_s16(d26s16, d27s16);
-    *q15s16 = vcombine_s16(d30s16, d31s16);
-
-    q0s16 = vaddq_s16(*q9s16, *q15s16);
-    q1s16 = vaddq_s16(*q11s16, *q13s16);
-    q2s16 = vsubq_s16(*q11s16, *q13s16);
-    q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-    *q13s16 = vsubq_s16(q4s16, q5s16);
-    q4s16 = vaddq_s16(q4s16, q5s16);
-    *q14s16 = vsubq_s16(q7s16, q6s16);
-    q7s16 = vaddq_s16(q7s16, q6s16);
-    d26s16 = vget_low_s16(*q13s16);
-    d27s16 = vget_high_s16(*q13s16);
-    d28s16 = vget_low_s16(*q14s16);
-    d29s16 = vget_high_s16(*q14s16);
-
-    d16s16 = vdup_n_s16(cospi_16_64);
-
-    q9s32 = vmull_s16(d28s16, d16s16);
-    q10s32 = vmull_s16(d29s16, d16s16);
-    q11s32 = vmull_s16(d28s16, d16s16);
-    q12s32 = vmull_s16(d29s16, d16s16);
-
-    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
-    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-    d10s16 = vqrshrn_n_s32(q9s32, 14);
-    d11s16 = vqrshrn_n_s32(q10s32, 14);
-    d12s16 = vqrshrn_n_s32(q11s32, 14);
-    d13s16 = vqrshrn_n_s32(q12s32, 14);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    *q8s16 = vaddq_s16(q0s16, q7s16);
-    *q9s16 = vaddq_s16(q1s16, q6s16);
-    *q10s16 = vaddq_s16(q2s16, q5s16);
-    *q11s16 = vaddq_s16(q3s16, q4s16);
-    *q12s16 = vsubq_s16(q3s16, q4s16);
-    *q13s16 = vsubq_s16(q2s16, q5s16);
-    *q14s16 = vsubq_s16(q1s16, q6s16);
-    *q15s16 = vsubq_s16(q0s16, q7s16);
-    return;
-}
-
-void vp9_idct8x8_64_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8_t *d1, *d2;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8;
-    uint64x1_t d0u64, d1u64, d2u64, d3u64;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-    q8s16 = vld1q_s16(input);
-    q9s16 = vld1q_s16(input + 8);
-    q10s16 = vld1q_s16(input + 16);
-    q11s16 = vld1q_s16(input + 24);
-    q12s16 = vld1q_s16(input + 32);
-    q13s16 = vld1q_s16(input + 40);
-    q14s16 = vld1q_s16(input + 48);
-    q15s16 = vld1q_s16(input + 56);
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
-               &q12s16, &q13s16, &q14s16, &q15s16);
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
-               &q12s16, &q13s16, &q14s16, &q15s16);
-
-    q8s16 = vrshrq_n_s16(q8s16, 5);
-    q9s16 = vrshrq_n_s16(q9s16, 5);
-    q10s16 = vrshrq_n_s16(q10s16, 5);
-    q11s16 = vrshrq_n_s16(q11s16, 5);
-    q12s16 = vrshrq_n_s16(q12s16, 5);
-    q13s16 = vrshrq_n_s16(q13s16, 5);
-    q14s16 = vrshrq_n_s16(q14s16, 5);
-    q15s16 = vrshrq_n_s16(q15s16, 5);
-
-    d1 = d2 = dest;
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                     vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                     vreinterpret_u8_u64(d1u64));
-    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
-                      vreinterpret_u8_u64(d2u64));
-    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
-                      vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-
-    q8s16 = q12s16;
-    q9s16 = q13s16;
-    q10s16 = q14s16;
-    q11s16 = q15s16;
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                     vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                     vreinterpret_u8_u64(d1u64));
-    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
-                      vreinterpret_u8_u64(d2u64));
-    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
-                      vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-    return;
-}
-
-void vp9_idct8x8_12_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8_t *d1, *d2;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8;
-    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
-    int16x4_t d26s16, d27s16, d28s16, d29s16;
-    uint64x1_t d0u64, d1u64, d2u64, d3u64;
-    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16;
-    int32x4_t q9s32, q10s32, q11s32, q12s32;
-
-    q8s16 = vld1q_s16(input);
-    q9s16 = vld1q_s16(input + 8);
-    q10s16 = vld1q_s16(input + 16);
-    q11s16 = vld1q_s16(input + 24);
-    q12s16 = vld1q_s16(input + 32);
-    q13s16 = vld1q_s16(input + 40);
-    q14s16 = vld1q_s16(input + 48);
-    q15s16 = vld1q_s16(input + 56);
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    // First transform rows
-    // stage 1
-    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
-    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
-
-    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
-    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
-
-    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-    q1s16 = vdupq_n_s16(cospi_12_64 * 2);
-
-    q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
-    q0s16 = vdupq_n_s16(cospi_16_64 * 2);
-
-    q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
-    // stage 2 & stage 3 - even half
-    q1s16 = vdupq_n_s16(cospi_24_64 * 2);
-
-    q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
-    q0s16 = vdupq_n_s16(cospi_8_64 * 2);
-
-    q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
-    q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
-    // stage 3 -odd half
-    q0s16 = vaddq_s16(q9s16, q15s16);
-    q1s16 = vaddq_s16(q9s16, q13s16);
-    q2s16 = vsubq_s16(q9s16, q13s16);
-    q3s16 = vsubq_s16(q9s16, q15s16);
-
-    // stage 2 - odd half
-    q13s16 = vsubq_s16(q4s16, q5s16);
-    q4s16 = vaddq_s16(q4s16, q5s16);
-    q14s16 = vsubq_s16(q7s16, q6s16);
-    q7s16 = vaddq_s16(q7s16, q6s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-
-    d16s16 = vdup_n_s16(cospi_16_64);
-    q9s32 = vmull_s16(d28s16, d16s16);
-    q10s32 = vmull_s16(d29s16, d16s16);
-    q11s32 = vmull_s16(d28s16, d16s16);
-    q12s32 = vmull_s16(d29s16, d16s16);
-
-    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
-    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-    d10s16 = vqrshrn_n_s32(q9s32, 14);
-    d11s16 = vqrshrn_n_s32(q10s32, 14);
-    d12s16 = vqrshrn_n_s32(q11s32, 14);
-    d13s16 = vqrshrn_n_s32(q12s32, 14);
-    q5s16 = vcombine_s16(d10s16, d11s16);
-    q6s16 = vcombine_s16(d12s16, d13s16);
-
-    // stage 4
-    q8s16 = vaddq_s16(q0s16, q7s16);
-    q9s16 = vaddq_s16(q1s16, q6s16);
-    q10s16 = vaddq_s16(q2s16, q5s16);
-    q11s16 = vaddq_s16(q3s16, q4s16);
-    q12s16 = vsubq_s16(q3s16, q4s16);
-    q13s16 = vsubq_s16(q2s16, q5s16);
-    q14s16 = vsubq_s16(q1s16, q6s16);
-    q15s16 = vsubq_s16(q0s16, q7s16);
-
-    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
-                 &q12s16, &q13s16, &q14s16, &q15s16);
-
-    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
-               &q12s16, &q13s16, &q14s16, &q15s16);
-
-    q8s16 = vrshrq_n_s16(q8s16, 5);
-    q9s16 = vrshrq_n_s16(q9s16, 5);
-    q10s16 = vrshrq_n_s16(q10s16, 5);
-    q11s16 = vrshrq_n_s16(q11s16, 5);
-    q12s16 = vrshrq_n_s16(q12s16, 5);
-    q13s16 = vrshrq_n_s16(q13s16, 5);
-    q14s16 = vrshrq_n_s16(q14s16, 5);
-    q15s16 = vrshrq_n_s16(q15s16, 5);
-
-    d1 = d2 = dest;
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                     vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                     vreinterpret_u8_u64(d1u64));
-    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
-                      vreinterpret_u8_u64(d2u64));
-    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
-                      vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-
-    q8s16 = q12s16;
-    q9s16 = q13s16;
-    q10s16 = q14s16;
-    q11s16 = q15s16;
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
-                     vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
-                     vreinterpret_u8_u64(d1u64));
-    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
-                      vreinterpret_u8_u64(d2u64));
-    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
-                      vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-    return;
-}
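
The 12-coefficient path above folds dct_const_round_shift into a single vqrdmulhq_s16 by doubling each cosine constant (vdupq_n_s16(cospi_28_64 * 2) and friends); the assembly version below spells out the reasoning in its comments. A scalar sketch of why that is equivalent, assuming DCT_CONST_BITS == 14 and an arithmetic right shift; the function names are illustrative only:

    #include <stdint.h>

    /* vqrdmulh.s16 per lane: saturate((2*a*b + (1 << 15)) >> 16). */
    static int16_t sketch_vqrdmulh_s16(int16_t a, int16_t b) {
      int64_t p = (2 * (int64_t)a * b + (1 << 15)) >> 16;
      return (int16_t)(p > INT16_MAX ? INT16_MAX : p);  /* only a == b == -32768 saturates */
    }

    /* With b = 2*c the result is (a*c + (1 << 13)) >> 14, i.e.
     * dct_const_round_shift(a * c), so one instruction replaces the
     * vmull/vmlsl + vqrshrn pair used on the full 64-coefficient path. */
    static int16_t sketch_round_shift_mul(int16_t a, int16_t c) {
      return sketch_vqrdmulh_s16(a, (int16_t)(2 * c));
    }

The doubled constants still fit in 16 bits (the largest, 2 * cospi_4_64 = 32138, stays below 32767), which is what makes the shortcut legal.
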
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
+++ /dev/null
@@ -1,519 +1,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_idct8x8_64_add_neon|
-    EXPORT  |vp9_idct8x8_12_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1-D IDCT on all the columns of an 8x8 16-bit data matrix that
-    ; is loaded in q8-q15. The output is stored back into q8-q15. This macro
-    ; clobbers q0-q7, using them as scratch registers during the calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14              ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14              ; >> 14
-    vqrshrn.s32     d23, q15, #14              ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14              ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Transpose an 8x8 16-bit data matrix. The data is loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct8x8_64_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    IDCT8x8_1D
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vp9_idct8x8_64_add_neon|
-
-;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|vp9_idct8x8_12_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    ; stage 1
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
-    ; multiply and shifts the result by 16 bits instead of 14, so the
-    ; constants are doubled before the multiply to compensate for this.
-    mov             r12, r3, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
-    mov             r12, r4, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_28_64)
-    vqrdmulh.s16    q4, q9, q0
-
-    mov             r12, r6, lsl #1
-    rsb             r12, #0
-    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_4_64)
-    vqrdmulh.s16    q7, q9, q1
-
-    mov             r12, r5, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
-
-    ; dct_const_round_shift(- input[3] * cospi_20_64)
-    vqrdmulh.s16    q5, q11, q0
-
-    mov             r12, r7, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
-
-    ; dct_const_round_shift(input[3] * cospi_12_64)
-    vqrdmulh.s16    q6, q11, q1
-
-    ; stage 2 & stage 3 - even half
-    mov             r12, r8, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrdmulh.s16    q9, q8, q0
-
-    mov             r12, r9, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_24_64)
-    vqrdmulh.s16    q13, q10, q1
-
-    ; dct_const_round_shift(input[1] * cospi_8_64)
-    vqrdmulh.s16    q15, q10, q0
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vp9_idct8x8_12_add_neon|
-
-    END
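
For reference, the cosine constants that the deleted assembly builds from split 8-bit immediates (mov/add pairs such as 0x2d00 + 0x41 = 11585) are round(16384 * cos(k * pi / 64)). A small stand-alone check that reproduces the values named in the "generate cospi_k_64" comments above (pi is defined locally rather than relying on M_PI):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double kPi = 3.14159265358979323846;
      const int k[] = { 4, 8, 12, 16, 20, 24, 28 };
      /* Expected output: 16069, 15137, 13623, 11585, 9102, 6270, 3196. */
      for (unsigned i = 0; i < sizeof(k) / sizeof(k[0]); ++i)
        printf("cospi_%d_64 = %ld\n", k[i], lround(16384.0 * cos(k[i] * kPi / 64.0)));
      return 0;
    }
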
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -11,283 +11,20 @@
 #include <math.h>
 
 #include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
 
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
-}
-
-void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-   0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
-
-    ip++;
-    dest++;
-  }
-}
-
-void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
-    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
-    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
-    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
-    ip++;
-    dest++;
-  }
-}
-
-static void idct4(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
-}
-
-void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    idct4(input, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    idct4(temp_in, temp_out);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
-    }
-  }
-}
-
-void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
-                         int dest_stride) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel_add(dest[0], a1);
-    dest[1] = clip_pixel_add(dest[1], a1);
-    dest[2] = clip_pixel_add(dest[2], a1);
-    dest[3] = clip_pixel_add(dest[3], a1);
-    dest += dest_stride;
-  }
-}
-
-static void idct8(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 2 & stage 3 - even half
-  idct4(step1, step1);
-
-  // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
-  // stage 3 -odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
-}
-
-void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    idct8(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i)
-      dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-static void iadst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
-}
-
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
-    { idct4, idct4  },  // DCT_DCT  = 0
-    { iadst4, idct4  },   // ADST_DCT = 1
-    { idct4, iadst4 },    // DCT_ADST = 2
-    { iadst4, iadst4 }      // ADST_ADST = 3
+    { idct4_c, idct4_c  },  // DCT_DCT  = 0
+    { iadst4_c, idct4_c  },   // ADST_DCT = 1
+    { idct4_c, iadst4_c },    // DCT_ADST = 2
+    { iadst4_c, iadst4_c }      // ADST_ADST = 3
   };
 
   int i, j;
@@ -314,88 +51,11 @@
   }
 }
 
-static void iadst8(const tran_low_t *input, tran_low_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
-  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
-  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
-  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
-  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
-  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
-  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
-  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
-
-  // stage 2
-  s0 = (int)x0;
-  s1 = (int)x1;
-  s2 = (int)x2;
-  s3 = (int)x3;
-  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
-  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
-  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
-  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-
-  // stage 3
-  s2 = (int)(cospi_16_64 * (x2 + x3));
-  s3 = (int)(cospi_16_64 * (x2 - x3));
-  s6 = (int)(cospi_16_64 * (x6 + x7));
-  s7 = (int)(cospi_16_64 * (x6 - x7));
-
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
-}
-
 static const transform_2d IHT_8[] = {
-  { idct8,  idct8  },  // DCT_DCT  = 0
-  { iadst8, idct8  },  // ADST_DCT = 1
-  { idct8,  iadst8 },  // DCT_ADST = 2
-  { iadst8, iadst8 }   // ADST_ADST = 3
+  { idct8_c,  idct8_c  },  // DCT_DCT  = 0
+  { iadst8_c, idct8_c  },  // ADST_DCT = 1
+  { idct8_c,  iadst8_c },  // DCT_ADST = 2
+  { iadst8_c, iadst8_c }   // ADST_ADST = 3
 };
 
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -425,400 +85,11 @@
   }
 }
 
-void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // only the first 4 rows have non-zero coefs
-  for (i = 0; i < 4; ++i) {
-    idct8(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    idct8(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-static void idct16(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0/2];
-  step1[1] = input[16/2];
-  step1[2] = input[8/2];
-  step1[3] = input[24/2];
-  step1[4] = input[4/2];
-  step1[5] = input[20/2];
-  step1[6] = input[12/2];
-  step1[7] = input[28/2];
-  step1[8] = input[2/2];
-  step1[9] = input[18/2];
-  step1[10] = input[10/2];
-  step1[11] = input[26/2];
-  step1[12] = input[6/2];
-  step1[13] = input[22/2];
-  step1[14] = input[14/2];
-  step1[15] = input[30/2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
-}
-
-void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    idct16(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-static void iadst16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4]
-              = output[5] = output[6] = output[7] = output[8]
-              = output[9] = output[10] = output[11] = output[12]
-              = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
-}
-
 static const transform_2d IHT_16[] = {
-  { idct16,  idct16  },  // DCT_DCT  = 0
-  { iadst16, idct16  },  // ADST_DCT = 1
-  { idct16,  iadst16 },  // DCT_ADST = 2
-  { iadst16, iadst16 }   // ADST_ADST = 3
+  { idct16_c,  idct16_c  },  // DCT_DCT  = 0
+  { iadst16_c, idct16_c  },  // ADST_DCT = 1
+  { idct16_c,  iadst16_c },  // DCT_ADST = 2
+  { iadst16_c, iadst16_c }   // ADST_ADST = 3
 };
 
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -848,494 +119,6 @@
   }
 }
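/* Editor's sketch (not part of this patch; helper names are illustrative):
 * the function body elided by this hunk consumes the IHT_16 table above by
 * running the tx_type-selected 1-D transform over every row and then over
 * every column, before the usual round-and-clip add into dest. */
typedef void (*sketch_tx1d)(const tran_low_t *input, tran_low_t *output);
typedef struct { sketch_tx1d row_tx, col_tx; } sketch_tx2d;

static void sketch_iht16x16(const sketch_tx2d *ht, const tran_low_t *input,
                            tran_low_t out[16 * 16]) {
  tran_low_t temp_in[16], temp_out[16];
  int i, j;
  for (i = 0; i < 16; ++i)                      /* 1-D pass over each row */
    ht->row_tx(input + 16 * i, out + 16 * i);
  for (i = 0; i < 16; ++i) {                    /* then over each column */
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    ht->col_tx(temp_in, temp_out);
    for (j = 0; j < 16; ++j) out[j * 16 + i] = temp_out[j];
  }
}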
 
-void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in the
-  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    idct16(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j*16 + i];
-    idct16(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i)
-      dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
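/* Editor's sketch (not part of this patch): the _256/_10/_1 variants above
 * let the caller pick the cheapest reconstruction from the eob count; the
 * thresholds below are illustrative rather than quoted from the actual
 * dispatcher. */
static void sketch_idct16x16_add(const tran_low_t *input, uint8_t *dest,
                                 int stride, int eob) {
  if (eob == 1)                /* DC-only block */
    vp9_idct16x16_1_add_c(input, dest, stride);
  else if (eob <= 10)          /* coefficients confined to the top-left 4x4 */
    vp9_idct16x16_10_add_c(input, dest, stride);
  else                         /* general case */
    vp9_idct16x16_256_add_c(input, dest, stride);
}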
-
-static void idct32(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
-
-  // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
-}
-
-void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j)
-      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      idct32(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
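/* Editor's sketch (not part of this patch): the zero_coeff[] reduction above
 * just ORs coefficient pairs down to a single flag so that all-zero rows can
 * skip the 1-D idct32 entirely; a simpler (if less vectorizer-friendly)
 * equivalent of that test is: */
static int sketch_row_is_all_zero(const tran_low_t *row, int width) {
  tran_low_t acc = 0;
  int j;
  for (j = 0; j < width; ++j) acc |= row[j];
  return acc == 0;  /* if so, memset the output row instead of transforming */
}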
-
-void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[32 * 32] = {0};
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // Only the upper-left 8x8 area has non-zero coefficients.
-  for (i = 0; i < 8; ++i) {
-    idct32(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i)
-      dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
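/* Editor's sketch (not part of this patch): both DC-only paths above scale
 * input[0] by cospi_16_64 (== 11585 ~= 2^14/sqrt(2)) once per dimension and
 * then drop 6 more bits, so the per-pixel offset a1 works out to roughly
 * input[0]/128.  A standalone check of that arithmetic: */
#include <stdio.h>

int main(void) {
  const long long cospi_16_64 = 11585;  /* round(2^14 * cos(pi/4)) */
  long long out = 512;                  /* example DC coefficient */
  long long a1;
  out = (out * cospi_16_64 + (1 << 13)) >> 14;  /* row-pass DC gain    */
  out = (out * cospi_16_64 + (1 << 13)) >> 14;  /* column-pass DC gain */
  a1 = (out + (1 << 5)) >> 6;                   /* final descale       */
  printf("a1 = %lld\n", a1);                    /* prints 4 (~512/128) */
  return 0;
}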
-
 // idct
 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
@@ -1424,294 +207,13 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
-    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
-    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
-    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
-
-    ip++;
-    dest++;
-  }
-}
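/* Editor's sketch (not part of this patch): highbd_clip_pixel_add(), used
 * above and throughout the high-bit-depth paths, adds the residual to the
 * reconstructed sample and clamps the result to [0, 2^bd - 1] for the
 * current bit depth; a minimal equivalent looks like this. */
static uint16_t sketch_highbd_clip_pixel_add(uint16_t dest, int64_t trans,
                                             int bd) {
  const int64_t v = (int64_t)dest + trans;
  const int64_t max = ((int64_t)1 << bd) - 1;  /* 255 / 1023 / 4095 */
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}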
-
-void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  (void) bd;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = highbd_clip_pixel_add(
-        dest[dest_stride * 0], a1, bd);
-    dest[dest_stride * 1] = highbd_clip_pixel_add(
-        dest[dest_stride * 1], e1, bd);
-    dest[dest_stride * 2] = highbd_clip_pixel_add(
-        dest[dest_stride * 2], e1, bd);
-    dest[dest_stride * 3] = highbd_clip_pixel_add(
-        dest[dest_stride * 3], e1, bd);
-    ip++;
-    dest++;
-  }
-}
-
-void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  (void) bd;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
-}
-
-void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    vp9_highbd_idct4(input, outptr, bd);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    vp9_highbd_idct4(temp_in, temp_out, bd);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-    }
-  }
-}
-
-void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
-    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
-    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
-    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
-    dest += dest_stride;
-  }
-}
-
-void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  // stage 2 & stage 3 - even half
-  vp9_highbd_idct4(step1, step1, bd);
-
-  // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
-  // stage 3 - odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  for (i = 0; i < 8; ++i) {
-    vp9_highbd_idct8(input, outptr, bd);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    vp9_highbd_idct8(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-    }
-  }
-}
-
-void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i)
-      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
-
-static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-  (void) bd;
-
-  if (!(x0 | x1 | x2 | x3)) {
-    memset(output, 0, 4 * sizeof(*output));
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = (tran_high_t)(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
-}
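/* Editor's note (not part of this patch): with the usual vpx constants
 * (sinpi_1_9 == 5283, sinpi_2_9 == 9929, sinpi_4_9 == 15212) the identity
 * sin(pi/9) + sin(2*pi/9) == sin(4*pi/9) holds exactly in fixed point,
 * which is what the factored form above leans on to produce all four
 * outputs from only eight multiplies.  A standalone check: */
#include <assert.h>

static void sketch_check_sinpi_identity(void) {
  const int sinpi_1_9 = 5283, sinpi_2_9 = 9929, sinpi_4_9 = 15212;
  assert(sinpi_1_9 + sinpi_2_9 == sinpi_4_9);  /* 5283 + 9929 == 15212 */
}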
-
 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   const highbd_transform_2d IHT_4[] = {
-    { vp9_highbd_idct4, vp9_highbd_idct4  },    // DCT_DCT  = 0
-    { highbd_iadst4, vp9_highbd_idct4 },    // ADST_DCT = 1
-    { vp9_highbd_idct4, highbd_iadst4 },    // DCT_ADST = 2
-    { highbd_iadst4, highbd_iadst4 }    // ADST_ADST = 3
+    { vp9_highbd_idct4_c, vp9_highbd_idct4_c  },    // DCT_DCT  = 0
+    { highbd_iadst4_c, vp9_highbd_idct4_c },    // ADST_DCT = 1
+    { vp9_highbd_idct4_c, highbd_iadst4_c },    // DCT_ADST = 2
+    { highbd_iadst4_c, highbd_iadst4_c }    // ADST_ADST = 3
   };
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
@@ -1739,88 +241,11 @@
   }
 }
 
-static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[7];
-  tran_low_t x1 = input[0];
-  tran_low_t x2 = input[5];
-  tran_low_t x3 = input[2];
-  tran_low_t x4 = input[3];
-  tran_low_t x5 = input[4];
-  tran_low_t x6 = input[1];
-  tran_low_t x7 = input[6];
-  (void) bd;
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    memset(output, 0, 8 * sizeof(*output));
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
-}
-
 static const highbd_transform_2d HIGH_IHT_8[] = {
-  { vp9_highbd_idct8,  vp9_highbd_idct8  },  // DCT_DCT  = 0
-  { highbd_iadst8, vp9_highbd_idct8  },  // ADST_DCT = 1
-  { vp9_highbd_idct8,  highbd_iadst8 },  // DCT_ADST = 2
-  { highbd_iadst8, highbd_iadst8 }   // ADST_ADST = 3
+  { vp9_highbd_idct8_c,  vp9_highbd_idct8_c  },  // DCT_DCT  = 0
+  { highbd_iadst8_c, vp9_highbd_idct8_c  },  // ADST_DCT = 1
+  { vp9_highbd_idct8_c,  highbd_iadst8_c },  // DCT_ADST = 2
+  { highbd_iadst8_c, highbd_iadst8_c }   // ADST_ADST = 3
 };
 
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1851,402 +276,11 @@
   }
 }
 
-void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  // Only the first 4 rows have non-zero coefficients.
-  for (i = 0; i < 4; ++i) {
-    vp9_highbd_idct8(input, outptr, bd);
-    input += 8;
-    outptr += 8;
-  }
-  // Then transform columns.
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    vp9_highbd_idct8(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-    }
-  }
-}
-
-void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-  (void) bd;
-
-  // stage 1
-  step1[0] = input[0/2];
-  step1[1] = input[16/2];
-  step1[2] = input[8/2];
-  step1[3] = input[24/2];
-  step1[4] = input[4/2];
-  step1[5] = input[20/2];
-  step1[6] = input[12/2];
-  step1[7] = input[28/2];
-  step1[8] = input[2/2];
-  step1[9] = input[18/2];
-  step1[10] = input[10/2];
-  step1[11] = input[26/2];
-  step1[12] = input[6/2];
-  step1[13] = input[22/2];
-  step1[14] = input[14/2];
-  step1[15] = input[30/2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  for (i = 0; i < 16; ++i) {
-    vp9_highbd_idct16(input, outptr, bd);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    vp9_highbd_idct16(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
-                           int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_low_t x0 = input[15];
-  tran_low_t x1 = input[0];
-  tran_low_t x2 = input[13];
-  tran_low_t x3 = input[2];
-  tran_low_t x4 = input[11];
-  tran_low_t x5 = input[4];
-  tran_low_t x6 = input[9];
-  tran_low_t x7 = input[6];
-  tran_low_t x8 = input[7];
-  tran_low_t x9 = input[8];
-  tran_low_t x10 = input[5];
-  tran_low_t x11 = input[10];
-  tran_low_t x12 = input[3];
-  tran_low_t x13 = input[12];
-  tran_low_t x14 = input[1];
-  tran_low_t x15 = input[14];
-  (void) bd;
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
-           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
-    memset(output, 0, 16 * sizeof(*output));
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
-}
-
 static const highbd_transform_2d HIGH_IHT_16[] = {
-  { vp9_highbd_idct16,  vp9_highbd_idct16  },  // DCT_DCT  = 0
-  { highbd_iadst16, vp9_highbd_idct16  },  // ADST_DCT = 1
-  { vp9_highbd_idct16,  highbd_iadst16 },  // DCT_ADST = 2
-  { highbd_iadst16, highbd_iadst16 }   // ADST_ADST = 3
+  { vp9_highbd_idct16_c,  vp9_highbd_idct16_c  },  // DCT_DCT  = 0
+  { highbd_iadst16_c, vp9_highbd_idct16_c  },  // ADST_DCT = 1
+  { vp9_highbd_idct16_c,  highbd_iadst16_c },  // DCT_ADST = 2
+  { highbd_iadst16_c, highbd_iadst16_c }   // ADST_ADST = 3
 };
 
 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2274,504 +308,6 @@
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
     }
-  }
-}
-
-void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                                   int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows. Since all non-zero dct coefficients are in the
-  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    vp9_highbd_idct16(input, outptr, bd);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j*16 + i];
-    vp9_highbd_idct16(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bd) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i)
-      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
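In the DC-only path above, the two Q14 multiplications by cospi_16_64 (11585, about 2^14 * sqrt(1/2)) halve the DC coefficient and the final shift divides by 64, so roughly input[0]/128 is added uniformly to all 256 pixels. A standalone sketch of the scalar math, with the bd-dependent WRAPLOW clamping omitted:

static int dc_only_gain(int dc) {
  const int cospi_16_64 = 11585;                   /* round(2^14 * sqrt(1/2)) */
  int out = (dc * cospi_16_64 + (1 << 13)) >> 14;  /* dct_const_round_shift   */
  out = (out * cospi_16_64 + (1 << 13)) >> 14;     /* applied twice: ~dc / 2  */
  return (out + (1 << 5)) >> 6;                    /* ROUND_POWER_OF_TWO(., 6) */
}
/* e.g. dc_only_gain(128) == 1 and dc_only_gain(-256) == -2, i.e. about dc / 128. */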
-
-static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-  (void) bd;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
-
-  // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
-}
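Most of the temp1/temp2 pairs in highbd_idct32 above are sign variants of the same Q14 fixed-point rotation of two inputs by one of the cospi angles; the removed code keeps the arithmetic inline, but the commonest form is equivalent to this illustrative helper:

static void highbd_butterfly(tran_low_t a, tran_low_t b, int c1, int c2, int bd,
                             tran_low_t *out_lo, tran_low_t *out_hi) {
  const tran_high_t t1 = (tran_high_t)a * c1 - (tran_high_t)b * c2;
  const tran_high_t t2 = (tran_high_t)a * c2 + (tran_high_t)b * c1;
  *out_lo = WRAPLOW(highbd_dct_const_round_shift(t1, bd), bd);
  *out_hi = WRAPLOW(highbd_dct_const_round_shift(t2, bd), bd);
}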
-
-void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
-                                     int stride, int bd) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_low_t zero_coeff[16];
-    for (j = 0; j < 16; ++j)
-      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32(input, outptr, bd);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    highbd_idct32(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
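The pairwise OR tree in the row loop above is simply an all-zero test over the 32 coefficients of a row, written as a reduction; rows that are entirely zero skip the 32-point transform and are memset instead. A plain-loop equivalent of the test:

static int row_has_nonzero_coeff(const tran_low_t *row) {
  tran_low_t acc = 0;
  int j;
  for (j = 0; j < 32; ++j)
    acc |= row[j];
  return acc != 0;  /* corresponds to the (zero_coeff[0] | zero_coeff[1]) check */
}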
-
-void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
-                                   int stride, int bd) {
-  tran_low_t out[32 * 32] = {0};
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  // Only upper-left 8x8 has non-zero coeff.
-  for (i = 0; i < 8; ++i) {
-    highbd_idct32(input, outptr, bd);
-    input += 32;
-    outptr += 32;
-  }
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    highbd_idct32(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bd) {
-  int i, j;
-  int a1;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i)
-      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
   }
 }
 
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -14,63 +14,16 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-#if CONFIG_VP9_HIGHBITDEPTH
-#include "vpx_dsp/vpx_dsp_common.h"
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid VP9 input streams, intermediate stage coefficients should always
-  // stay within the range of a signed 16 bit integer. Coefficients can go out
-  // of this range for invalid/corrupt VP9 streams. However, strictly checking
-  // this range for every intermediate coefficient can burdensome for a decoder,
-  // therefore the following assertion is only enabled when configured with
-  // --enable-coefficient-range-checking.
-  assert(INT16_MIN <= input);
-  assert(input <= INT16_MAX);
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
-}
-
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
-  // stay within the ranges:
-  // - 8 bit: signed 16 bit integer
-  // - 10 bit: signed 18 bit integer
-  // - 12 bit: signed 20 bit integer
-  const int32_t int_max = (1 << (7 + bd)) - 1;
-  const int32_t int_min = -int_max - 1;
-  assert(int_min <= input);
-  assert(input <= int_max);
-  (void) int_min;
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  (void) bd;
-  return (tran_low_t)input;
-}
-
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
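With DCT_CONST_BITS at its value of 14 (from vpx_dsp/txfm_common.h), the round-shift helpers removed above boil down to a single round-to-nearest shift out of Q14; a sketch with the optional range assertions dropped:

static tran_low_t dct_const_round_shift_sketch(tran_high_t input) {
  return (tran_low_t)((input + (1 << 13)) >> 14);  /* ROUND_POWER_OF_TWO(input, 14) */
}
/* highbd_dct_const_round_shift() performs the same arithmetic; bd only affects
   the optional CONFIG_COEFFICIENT_RANGE_CHECKING assertions. */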
 typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
@@ -85,28 +38,6 @@
 } highbd_transform_2d;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid in VP9,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
-#endif  // CONFIG_EMULATE_HARDWARE
-
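A concrete reading of the emulate-hardware WRAPLOW above: for bd == 8 it shifts by 24 - 8 = 16, i.e. it sign-extends the low 16 bits and wraps the value exactly as a 16-bit hardware accumulator would, while bd == 10 and bd == 12 keep 18 and 20 bits respectively. Illustrative values (two's-complement target assumed, as the macro itself already does):

#define WRAPLOW_SKETCH(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))
/* WRAPLOW_SKETCH(32767, 8)   ==  32767     in range, unchanged         */
/* WRAPLOW_SKETCH(32769, 8)   == -32767     wraps past INT16_MAX        */
/* WRAPLOW_SKETCH(131073, 10) == -131071    wraps past the 18-bit limit */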
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);
 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -126,9 +57,6 @@
                       int stride, int eob);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
-void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
-void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd);
 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -145,11 +73,6 @@
                            uint8_t *dest, int stride, int eob, int bd);
 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
                              uint8_t *dest, int stride, int eob, int bd);
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
-                                             int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -87,39 +87,6 @@
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct4x4_1_add/;
-
-  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct4x4_16_add/;
-
-  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_1_add/;
-
-  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_64_add/;
-
-  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct8x8_12_add/;
-
-  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_1_add/;
-
-  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_256_add/;
-
-  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct16x16_10_add/;
-
-  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_1024_add/;
-
-  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_34_add/;
-
-  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct32x32_1_add/;
-
   add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
   specialize qw/vp9_iht4x4_16_add/;
 
@@ -128,51 +95,9 @@
 
   add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
   specialize qw/vp9_iht16x16_256_add/;
-
-  # dct and add
-
-  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_iwht4x4_1_add/;
-
-  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_iwht4x4_16_add/;
-
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
   if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct4x4_1_add/;
-
-    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct4x4_16_add/;
-
-    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_1_add/;
-
-    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_64_add/;
-
-    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_12_add/;
-
-    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_1_add/;
-
-    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_256_add/;
-
-    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_10_add/;
-
-    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_1024_add/;
-
-    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_34_add/;
-
-    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_1_add/;
-
     add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp9_iht4x4_16_add/;
 
@@ -181,50 +106,7 @@
 
     add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/vp9_iht16x16_256_add/;
-
-    # dct and add
-
-    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_1_add/;
-
-    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_16_add/;
   } else {
-    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
-
-    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
-
-    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
-    #is this a typo?
-    $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
-
-    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
-
     add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
 
@@ -233,14 +115,6 @@
 
     add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
-
-    # dct and add
-
-    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_1_add msa/;
-
-    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
   }
 }
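For reference, each add_proto/specialize pair in this file becomes a block in the generated vp9_rtcd.h plus an assignment in its setup_rtcd_internal(); roughly, for the vp9_iht4x4_16_add entry above on an x86 build with runtime CPU detection (illustrative only, the exact output depends on the configure flags and target):

void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride, int tx_type);
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, int tx_type);
RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest,
                                      int dest_stride, int tx_type);

/* ...and inside the generated setup_rtcd_internal(flags): */
vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;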
 
@@ -295,24 +169,6 @@
   #
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct4x4_1_add/;
-
-  add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct8x8_1_add/;
-
-  add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct16x16_1_add/;
-
-  add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct32x32_1024_add/;
-
-  add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct32x32_34_add/;
-
-  add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_idct32x32_1_add/;
-
   add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/vp9_highbd_iht4x4_16_add/;
 
@@ -321,50 +177,6 @@
 
   add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
   specialize qw/vp9_highbd_iht16x16_256_add/;
-
-  # dct and add
-
-  add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_iwht4x4_1_add/;
-
-  add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/vp9_highbd_iwht4x4_16_add/;
-
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-
-    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct4x4_16_add/;
-
-    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct8x8_64_add/;
-
-    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct8x8_10_add/;
-
-    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct16x16_256_add/;
-
-    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct16x16_10_add/;
-
-  } else {
-
-    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct4x4_16_add sse2/;
-
-    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct8x8_64_add sse2/;
-
-    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct8x8_10_add sse2/;
-
-    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct16x16_256_add sse2/;
-
-    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vp9_highbd_idct16x16_10_add sse2/;
-  }
 }
 
 #
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -8,260 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
-#define RECON_AND_STORE4X4(dest, in_x) \
-{                                                     \
-  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-  d0 = _mm_unpacklo_epi8(d0, zero); \
-  d0 = _mm_add_epi16(in_x, d0); \
-  d0 = _mm_packus_epi16(d0, d0); \
-  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
-}
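RECON_AND_STORE4X4 above adds one row of reconstructed residual to four destination pixels; its scalar meaning is a saturating add of the low four 16-bit lanes of in_x onto dest (sketch, taking those lanes as a plain array):

static void recon_and_store4x4_scalar(uint8_t *dest, const int16_t *in_x) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int v = dest[i] + in_x[i];
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}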
-
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = _mm_load_si128((const __m128i *)input);
-  input2 = _mm_load_si128((const __m128i *)(input + 8));
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then we get:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then we get:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-static void idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-static void iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[2];
@@ -327,537 +77,6 @@
   }
 }
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
-                      out0, out1, out2, out3, out4, out5, out6, out7) \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
-                                                        \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-                                                            \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
-  }
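The unpack cascade in TRANSPOSE_8X8 above interleaves at 16-, 32- and then 64-bit granularity to transpose eight rows of eight int16 values held in eight registers; its scalar meaning is just:

static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}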
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
-                         out0, out1, out2, out3) \
-  {                                              \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
-    \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                            \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-  }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
-                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
-  {   \
-      tmp0 = _mm_madd_epi16(lo_0, cst0); \
-      tmp1 = _mm_madd_epi16(hi_0, cst0); \
-      tmp2 = _mm_madd_epi16(lo_0, cst1); \
-      tmp3 = _mm_madd_epi16(hi_0, cst1); \
-      tmp4 = _mm_madd_epi16(lo_1, cst2); \
-      tmp5 = _mm_madd_epi16(hi_1, cst2); \
-      tmp6 = _mm_madd_epi16(lo_1, cst3); \
-      tmp7 = _mm_madd_epi16(hi_1, cst3); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      tmp4 = _mm_add_epi32(tmp4, rounding); \
-      tmp5 = _mm_add_epi32(tmp5, rounding); \
-      tmp6 = _mm_add_epi32(tmp6, rounding); \
-      tmp7 = _mm_add_epi32(tmp7, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-      \
-      res0 = _mm_packs_epi32(tmp0, tmp1); \
-      res1 = _mm_packs_epi32(tmp2, tmp3); \
-      res2 = _mm_packs_epi32(tmp4, tmp5); \
-      res3 = _mm_packs_epi32(tmp6, tmp7); \
-  }
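MULTIPLICATION_AND_ADD above relies on _mm_madd_epi16: each cstN register holds a replicated (c, d) coefficient pair and each lo/hi register holds interleaved (a, b) input pairs, so one madd yields a*c + b*d per 32-bit lane, which is then rounded out of Q14 and packed back to 16 bits. Per-lane scalar equivalent (sketch):

static int16_t mul_add_round(int16_t a, int16_t b, int16_t c, int16_t d) {
  const int32_t t = (int32_t)a * c + (int32_t)b * d;
  return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}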
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {   \
-      tmp0 = _mm_madd_epi16(lo_0, cst0); \
-      tmp1 = _mm_madd_epi16(hi_0, cst0); \
-      tmp2 = _mm_madd_epi16(lo_0, cst1); \
-      tmp3 = _mm_madd_epi16(hi_0, cst1); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      \
-      res0 = _mm_packs_epi32(tmp0, tmp1); \
-      res1 = _mm_packs_epi32(tmp2, tmp3); \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
-                 out0, out1, out2, out3, out4, out5, out6, out7)  \
-  { \
-  /* Stage1 */      \
-  { \
-    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
-    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
-    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
-    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
-    \
-    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
-                          stg1_1, stg1_2, stg1_3, stp1_4,      \
-                          stp1_7, stp1_5, stp1_6)              \
-  } \
-    \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
-    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
-    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
-    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
-    \
-    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
-                           stg2_1, stg2_2, stg2_3, stp2_0,     \
-                           stp2_1, stp2_2, stp2_3)             \
-    \
-    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
-    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
-    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
-    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
-  } \
-    \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    \
-    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
-    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
-    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
-    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
-    \
-    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
-    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
-    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  } \
-  \
-  /* Stage4  */ \
-  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
-  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
-  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
-  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
-  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
-  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
-  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
-  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
-  }
-
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
-                  in0, in1, in2, in3, in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
-          in0, in1, in2, in3, in4, in5, in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
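
[editor's note] The DC-only path computes a single value from input[0] and splats it over the whole 8x8 block via RECON_AND_STORE. A scalar sketch of that computation, assuming the usual VP9 values DCT_CONST_BITS = 14 and cospi_16_64 = 11585; the names below are local stand-ins for this note:

#include <stdint.h>

/* Local stand-ins for the library's DCT_CONST_BITS / cospi_16_64. */
enum { kDctConstBits = 14, kCospi16_64 = 11585 };

/* dct_const_round_shift(): round, then shift down by kDctConstBits. */
static int round_shift_dct(int64_t x) {
  return (int)((x + (1 << (kDctConstBits - 1))) >> kDctConstBits);
}

/* DC value that gets added to every pixel of the 8x8 block. */
static int idct8x8_dc(int16_t dc_coeff) {
  int a = round_shift_dct((int64_t)dc_coeff * kCospi16_64);
  a = round_shift_dct((int64_t)a * kCospi16_64);
  return (a + (1 << 4)) >> 5;  /* ROUND_POWER_OF_TWO(a, 5) */
}
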
-
-static void idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
-                in0, in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
-        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
-}
-
-static void iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit integers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
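
[editor's note] Each stage above follows the same pattern: unpack two 16-bit vectors so pairs (a, b) are interleaved, then _mm_madd_epi16 with a pair_set_epi16(c0, c1) constant yields a*c0 + b*c1 per 32-bit lane, which is rounded and shifted back to 16 bits. A scalar view of one such rotation; the names are illustrative, the exact constant pairs and signs vary per stage, and the SSE2 code additionally saturates via _mm_packs_epi32:

#include <stdint.h>

/* One lane of the unpack + _mm_madd_epi16 butterfly used above.
 * c0/c1 play the role of a pair_set_epi16(c0, c1) constant. */
static void butterfly_rotation(int16_t a, int16_t b, int c0, int c1,
                               int16_t *out0, int16_t *out1) {
  const int bits = 14;                   /* DCT_CONST_BITS */
  const int rounding = 1 << (bits - 1);  /* DCT_CONST_ROUNDING */
  const int s0 = a * c0 + b * c1;        /* madd with (c0, c1)  */
  const int s1 = a * c1 - b * c0;        /* madd with (c1, -c0) */
  *out0 = (int16_t)((s0 + rounding) >> bits);
  *out1 = (int16_t)((s1 + rounding) >> bits);
}
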
-
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[8];
@@ -925,1366 +144,6 @@
   RECON_AND_STORE(dest + 7 * stride, in[7]);
 }
 
-void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
-        in0, in1, in2, in3, in4, in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16 \
-  /* Stage2 */ \
-  { \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
-                           stg2_0, stg2_1, stg2_2, stg2_3, \
-                           stp2_8, stp2_15, stp2_9, stp2_14) \
-    \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
-                           stg2_4, stg2_5, stg2_6, stg2_7, \
-                           stp2_10, stp2_13, stp2_11, stp2_12) \
-  } \
-    \
-  /* Stage3 */ \
-  { \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
-    \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
-                           stg3_0, stg3_1, stg3_2, stg3_3, \
-                           stp1_4, stp1_7, stp1_5, stp1_6) \
-    \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-    \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  } \
-  \
-  /* Stage4 */ \
-  { \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
-    \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
-                           stg4_0, stg4_1, stg4_2, stg4_3, \
-                           stp2_0, stp2_1, stp2_2, stp2_3) \
-    \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-    \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
-                           stg4_4, stg4_5, stg4_6, stg4_7, \
-                           stp2_9, stp2_14, stp2_10, stp2_13) \
-  } \
-    \
-  /* Stage5 */ \
-  { \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-    \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-    \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-    \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-    \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-  } \
-    \
-  /* Stage6 */ \
-  { \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-    \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-    \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                           stg6_0, stg4_0, stg6_0, stg4_0, \
-                           stp2_10, stp2_13, stp2_11, stp2_12) \
-  }
-
-#define IDCT16_10 \
-    /* Stage2 */ \
-    { \
-      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
-      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
-      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
-      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
-      \
-      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
-                             stg2_0, stg2_1, stg2_6, stg2_7, \
-                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
-    } \
-      \
-    /* Stage3 */ \
-    { \
-      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
-      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
-      \
-      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
-                               stg3_0, stg3_1,  \
-                               stp2_4, stp2_7) \
-      \
-      stp1_9  =  stp1_8_0; \
-      stp1_10 =  stp1_11;  \
-      \
-      stp1_13 = stp1_12_0; \
-      stp1_14 = stp1_15;   \
-    } \
-    \
-    /* Stage4 */ \
-    { \
-      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
-      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
-      \
-      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-      \
-      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
-                               stg4_0, stg4_1, \
-                               stp1_0, stp1_1) \
-      stp2_5 = stp2_4; \
-      stp2_6 = stp2_7; \
-      \
-      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
-                             stg4_4, stg4_5, stg4_6, stg4_7, \
-                             stp2_9, stp2_14, stp2_10, stp2_13) \
-    } \
-      \
-    /* Stage5 */ \
-    { \
-      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-      \
-      stp1_2 = stp1_1; \
-      stp1_3 = stp1_0; \
-      \
-      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-      \
-      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
-      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
-      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
-      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-      \
-      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
-      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
-      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
-      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-    } \
-      \
-    /* Stage6 */ \
-    { \
-      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-      \
-      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
-      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
-      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
-      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-      \
-      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                             stg6_0, stg4_0, stg6_0, stg4_0, \
-                             stp2_10, stp2_13, stp2_11, stp2_12) \
-    }
-
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = _mm_load_si128((const __m128i *)input);
-    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
-    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
-    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
-    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
-    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
-    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
-    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
-    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
-    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
-    input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-    int j;
-    // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 2; ++i) {
-    RECON_AND_STORE(dest +  0 * stride, dc_value);
-    RECON_AND_STORE(dest +  1 * stride, dc_value);
-    RECON_AND_STORE(dest +  2 * stride, dc_value);
-    RECON_AND_STORE(dest +  3 * stride, dc_value);
-    RECON_AND_STORE(dest +  4 * stride, dc_value);
-    RECON_AND_STORE(dest +  5 * stride, dc_value);
-    RECON_AND_STORE(dest +  6 * stride, dc_value);
-    RECON_AND_STORE(dest +  7 * stride, dc_value);
-    RECON_AND_STORE(dest +  8 * stride, dc_value);
-    RECON_AND_STORE(dest +  9 * stride, dc_value);
-    RECON_AND_STORE(dest + 10 * stride, dc_value);
-    RECON_AND_STORE(dest + 11 * stride, dc_value);
-    RECON_AND_STORE(dest + 12 * stride, dc_value);
-    RECON_AND_STORE(dest + 13 * stride, dc_value);
-    RECON_AND_STORE(dest + 14 * stride, dc_value);
-    RECON_AND_STORE(dest + 15 * stride, dc_value);
-    dest += 8;
-  }
-}
-
-static void iadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8]  = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9]  = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9]  = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-static void iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
-
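Editorial note (not part of the patch): the idct16_sse2/iadst16_sse2 wrappers above transpose the 16x16 block once and then run the 1-D transform on each 8-column half, so calling a wrapper for the row pass and again for the column pass yields the 2-D transform. Every stage narrows its 32-bit _mm_madd_epi16 products back to 16 bits with the add/shift/pack sequence seen throughout this file. A minimal scalar sketch of that step, assuming the usual libvpx fixed-point layout (DCT_CONST_BITS == 14, DCT_CONST_ROUNDING == 1 << 13); the sk_ name is hypothetical:

#include <stdint.h>

/* Scalar equivalent of _mm_add_epi32 + _mm_srai_epi32 + _mm_packs_epi32. */
static int16_t sk_dct_const_round_shift(int32_t input) {
  const int32_t rounded = (input + (1 << 13)) >> 14;  /* rounding, then shift */
  if (rounded > INT16_MAX) return INT16_MAX;  /* _mm_packs_epi32 saturates */
  if (rounded < INT16_MIN) return INT16_MIN;
  return (int16_t)rounded;
}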
 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                                int tx_type) {
   __m128i in0[16], in1[16];
@@ -2319,1905 +178,3 @@
   dest += 8;
   write_buffer_8x16(dest, in1, stride);
 }
-
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
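Editorial note (not part of the patch): the output loop of the function above rounds each residual with _mm_adds_epi16(in, 1 << 5), shifts right by 6, and RECON_AND_STORE adds the result to the destination pixels with unsigned saturation via _mm_packus_epi16. A scalar sketch of that reconstruction step; the sk_ helpers are hypothetical names:

#include <stdint.h>

static uint8_t sk_clip_pixel(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One row of the final rounding/shift plus the add-to-destination step. */
static void sk_recon_row(uint8_t *dest, const int16_t *residual, int cols) {
  int c;
  for (c = 0; c < cols; ++c) {
    const int32_t res = (residual[c] + 32) >> 6;  /* (1 << 5) rounding, >> 6 */
    dest[c] = sk_clip_pixel((int32_t)dest[c] + res);
  }
}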
-#define LOAD_DQCOEFF(reg, input) \
-  { \
-    reg = _mm_load_si128((const __m128i *)input); \
-    input += 8; \
-  }
-
-#define IDCT32_34 \
-/* Stage1 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
-  \
-  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
-  \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
-  \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
-                         stg1_1, stp1_16, stp1_31); \
-  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
-                         stg1_7, stp1_19, stp1_28); \
-  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
-                         stg1_9, stp1_20, stp1_27); \
-  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
-                         stg1_15, stp1_23, stp1_24); \
-} \
-\
-/* Stage2 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
-  \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
-                         stg2_1, stp2_8, stp2_15); \
-  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
-                         stg2_7, stp2_11, stp2_12); \
-  \
-  stp2_16 = stp1_16; \
-  stp2_19 = stp1_19; \
-  \
-  stp2_20 = stp1_20; \
-  stp2_23 = stp1_23; \
-  \
-  stp2_24 = stp1_24; \
-  stp2_27 = stp1_27; \
-  \
-  stp2_28 = stp1_28; \
-  stp2_31 = stp1_31; \
-} \
-\
-/* Stage3 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
-  \
-  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
-  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
-                         stg3_1, stp1_4, stp1_7); \
-  \
-  stp1_8 = stp2_8; \
-  stp1_11 = stp2_11; \
-  stp1_12 = stp2_12; \
-  stp1_15 = stp2_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
-                         stp1_18, stp1_29) \
-  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
-                         stp1_22, stp1_25) \
-  \
-  stp1_16 = stp2_16; \
-  stp1_31 = stp2_31; \
-  stp1_19 = stp2_19; \
-  stp1_20 = stp2_20; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_27 = stp2_27; \
-  stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
-  const __m128i zero = _mm_setzero_si128();\
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
-  \
-  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
-  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
-  \
-  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
-                         stg4_1, stp2_0, stp2_1); \
-  \
-  stp2_4 = stp1_4; \
-  stp2_5 = stp1_4; \
-  stp2_6 = stp1_7; \
-  stp2_7 = stp1_7; \
-  \
-  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
-                         stp2_10, stp2_13) \
-  \
-  stp2_8 = stp1_8; \
-  stp2_15 = stp1_15; \
-  stp2_11 = stp1_11; \
-  stp2_12 = stp1_12; \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
-  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  stp1_0 = stp2_0; \
-  stp1_1 = stp2_1; \
-  stp1_2 = stp2_1; \
-  stp1_3 = stp2_0; \
-  \
-  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-  \
-  tmp0 = _mm_add_epi32(tmp0, rounding); \
-  tmp1 = _mm_add_epi32(tmp1, rounding); \
-  tmp2 = _mm_add_epi32(tmp2, rounding); \
-  tmp3 = _mm_add_epi32(tmp3, rounding); \
-  \
-  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  \
-  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  \
-  stp1_4 = stp2_4; \
-  stp1_7 = stp2_7; \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  \
-  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  \
-  stp1_22 = stp2_22; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_25 = stp2_25; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-  \
-  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-  \
-  stp2_8 = stp1_8; \
-  stp2_9 = stp1_9; \
-  stp2_14 = stp1_14; \
-  stp2_15 = stp1_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
-                         stp2_13, stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-  \
-  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  stp1_18 = stp2_18; \
-  stp1_19 = stp2_19; \
-  \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-  \
-  stp1_28 = stp2_28; \
-  stp1_29 = stp2_29; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-}
-
-
-#define IDCT32 \
-/* Stage1 */ \
-{ \
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
-  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
-  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
-  \
-  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
-  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
-  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
-  \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
-  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
-  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
-  \
-  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
-  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
-  \
-  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
-                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
-                         stp1_17, stp1_30) \
-  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
-                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
-                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
-                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-} \
-\
-/* Stage2 */ \
-{ \
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
-  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
-  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
-  \
-  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
-  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
-  \
-  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
-                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
-                         stp2_14) \
-  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
-                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
-                         stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
-  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
-  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
-  \
-  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
-  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
-  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
-  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
-  \
-  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
-  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
-  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
-} \
-\
-/* Stage3 */ \
-{ \
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
-  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
-  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
-  \
-  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
-  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  \
-  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
-                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
-                         stp1_6) \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
-  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
-  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
-  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  \
-  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
-                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
-                         stp1_18, stp1_29) \
-  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
-                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
-                         stp1_22, stp1_25) \
-  \
-  stp1_16 = stp2_16; \
-  stp1_31 = stp2_31; \
-  stp1_19 = stp2_19; \
-  stp1_20 = stp2_20; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_27 = stp2_27; \
-  stp1_28 = stp2_28; \
-} \
-\
-/* Stage4 */ \
-{ \
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
-  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
-  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
-  \
-  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
-  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  \
-  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
-                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
-                         stp2_2, stp2_3) \
-  \
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-  \
-  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
-                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
-                         stp2_10, stp2_13) \
-  \
-  stp2_8 = stp1_8; \
-  stp2_15 = stp1_15; \
-  stp2_11 = stp1_11; \
-  stp2_12 = stp1_12; \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
-  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
-  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
-  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
-  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
-  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
-  \
-  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
-  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
-  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
-  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
-  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
-} \
-\
-/* Stage5 */ \
-{ \
-  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
-  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
-  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
-  \
-  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
-  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-  \
-  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
-  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
-  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
-  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-  \
-  tmp0 = _mm_add_epi32(tmp0, rounding); \
-  tmp1 = _mm_add_epi32(tmp1, rounding); \
-  tmp2 = _mm_add_epi32(tmp2, rounding); \
-  tmp3 = _mm_add_epi32(tmp3, rounding); \
-  \
-  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  \
-  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-  \
-  stp1_4 = stp2_4; \
-  stp1_7 = stp2_7; \
-  \
-  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  \
-  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
-                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
-                         stp1_19, stp1_28) \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
-                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  \
-  stp1_22 = stp2_22; \
-  stp1_23 = stp2_23; \
-  stp1_24 = stp2_24; \
-  stp1_25 = stp2_25; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-} \
-\
-/* Stage6 */ \
-{ \
-  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
-  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
-  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-  \
-  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
-  \
-  stp2_8 = stp1_8; \
-  stp2_9 = stp1_9; \
-  stp2_14 = stp1_14; \
-  stp2_15 = stp1_15; \
-  \
-  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
-                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
-                         stp2_13, stp2_11, stp2_12) \
-  \
-  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
-  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
-  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
-  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
-  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
-  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
-  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
-  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
-  \
-  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
-  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
-  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
-  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
-  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
-  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
-  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
-  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
-} \
-\
-/* Stage7 */ \
-{ \
-  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
-  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
-  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
-  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
-  \
-  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
-  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
-  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
-  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
-  \
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
-  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
-  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
-  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
-  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
-  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
-  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
-  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
-  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
-  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
-  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
-  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
-  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
-  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
-  \
-  stp1_16 = stp2_16; \
-  stp1_17 = stp2_17; \
-  stp1_18 = stp2_18; \
-  stp1_19 = stp2_19; \
-  \
-  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
-                         stp1_21, stp1_26) \
-  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
-                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
-                         stp1_23, stp1_24) \
-  \
-  stp1_28 = stp2_28; \
-  stp1_29 = stp2_29; \
-  stp1_30 = stp2_30; \
-  stp1_31 = stp2_31; \
-}
-
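Editorial note (not part of the patch): the IDCT32_34 and IDCT32 macros are built almost entirely from MULTIPLICATION_AND_ADD* butterflies. A constant built with pair_set_epi16(c0, c1), applied with _mm_madd_epi16 to an interleaved (x, y) pair, yields x * c0 + y * c1 per 32-bit lane, so a typical constant pair such as stg1_0/stg1_1 implements the rotation sketched below. Scalar sketch only, assuming DCT_CONST_BITS == 14 and omitting the saturation that _mm_packs_epi32 applies; sk_ names are hypothetical:

#include <stdint.h>

static int16_t sk_round_shift(int32_t v) {
  return (int16_t)((v + (1 << 13)) >> 14);
}

/* With stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64) and
 * stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64), the interleaved
 * (x, y) = (in[1], in[31]) pair produces the two outputs below. */
static void sk_butterfly(int16_t x, int16_t y, int c0, int c1,
                         int16_t *out0, int16_t *out1) {
  *out0 = sk_round_shift((int32_t)x * c0 - (int32_t)y * c1);  /* x*cospi_31 - y*cospi_1 */
  *out1 = sk_round_shift((int32_t)x * c1 + (int32_t)y * c0);  /* x*cospi_1 + y*cospi_31 */
}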
-// Only the upper-left 8x8 block has non-zero coefficients.
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
-          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
-          stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
-          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
-          stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data. Only the top-left 8x8 block needs to be loaded.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 32));
-  in[2] = _mm_load_si128((const __m128i *)(input + 64));
-  in[3] = _mm_load_si128((const __m128i *)(input + 96));
-  in[4] = _mm_load_si128((const __m128i *)(input + 128));
-  in[5] = _mm_load_si128((const __m128i *)(input + 160));
-  in[6] = _mm_load_si128((const __m128i *)(input + 192));
-  in[7] = _mm_load_si128((const __m128i *)(input + 224));
-
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
-  array_transpose_8x8(in, in);
-  // TODO(hkuang): The following transposes are unnecessary, but removing them
-  // leads to a performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
-  IDCT32_34
-
-  // First 1-D pass: store the 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-    int j;
-    const __m128i zero = _mm_setzero_si128();
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // Second 1-D pass: calculate the results and store them to the destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
-          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
-          stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
-          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
-          stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // Check whether all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
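The OR-reduction just above folds all 32 coefficient vectors into one register and tests it with _mm_cmpeq_epi32/_mm_movemask_epi8 so that an all-zero 8x32 strip can be skipped outright. A minimal sketch of the same check, assuming only SSE2 (all_zero_sse2 is a hypothetical helper name, not part of the library):

#include <emmintrin.h>  // SSE2

// Return 1 when all n vectors are zero: OR everything together, compare the
// accumulator against zero, and require every byte of the compare mask set.
static int all_zero_sse2(const __m128i *v, int n) {
  __m128i acc = _mm_setzero_si128();
  int i;
  for (i = 0; i < n; ++i) acc = _mm_or_si128(acc, v[i]);
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, _mm_setzero_si128())) == 0xFFFF;
}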
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32
-
-    // 1-D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-  }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    IDCT32
-
-    // 2-D: Calculate the results and store them to the destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 4; ++i) {
-    int j;
-    for (j = 0; j < 32; ++j) {
-      RECON_AND_STORE(dest + j * stride, dc_value);
-    }
-    dest += 8;
-  }
-}
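vp9_idct32x32_1_add_sse2 handles the DC-only case: input[0] is passed through dct_const_round_shift twice with cospi_16_64 (11585), reduced to a1 with ROUND_POWER_OF_TWO(out, 6), and that single constant is added to every destination pixel. A scalar sketch of the same arithmetic, with local copies of the rounding macro and a hypothetical clip_pixel8 helper:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void idct32x32_dc_add_sketch(const int16_t *input, uint8_t *dest, int stride) {
  const int cospi_16_64 = 11585;
  int out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  int a1, r, c;
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      dest[r * stride + c] = clip_pixel8(dest[r * stride + c] + a1);
}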
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
-  __m128i ubounded, retval;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
-  ubounded = _mm_cmpgt_epi16(value, max);
-  retval = _mm_andnot_si128(ubounded, value);
-  ubounded = _mm_and_si128(ubounded, max);
-  retval = _mm_or_si128(retval, ubounded);
-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
-  return retval;
-}
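clamp_high_sse2 clamps every 16-bit lane to the valid high-bit-depth pixel range [0, (1 << bd) - 1]. A scalar equivalent for reference (a sketch defined locally, not the library's API):

#include <stdint.h>

static uint16_t clamp_high_c(int32_t value, int bd) {
  const int32_t max = (1 << bd) - 1;  // e.g. 1023 when bd == 10
  if (value < 0) return 0;
  if (value > max) return (uint16_t)max;
  return (uint16_t)value;
}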
-
-void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  __m128i inptr[4];
-  __m128i sign_bits[2];
-  __m128i temp_mm, min_input, max_input;
-  int test;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int optimised_cols = 0;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(12043);
-  const __m128i min = _mm_set1_epi16(-12043);
-  // Load input into __m128i
-  inptr[0] = _mm_loadu_si128((const __m128i *)input);
-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
-  // Pack to 16 bits
-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp_mm = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp_mm);
-
-  if (!test) {
-    // Do the row transform
-    idct4_sse2(inptr);
-
-    // Check the min & max values
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp_mm = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp_mm);
-
-    if (test) {
-      transpose_4x4(inptr);
-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vp9_highbd_idct4(input, outptr, bd);
-      input += 4;
-      outptr += 4;
-    }
-  }
-
-  if (optimised_cols) {
-    idct4_sse2(inptr);
-
-    // Final round and shift
-    inptr[0] = _mm_add_epi16(inptr[0], eight);
-    inptr[1] = _mm_add_epi16(inptr[1], eight);
-
-    inptr[0] = _mm_srai_epi16(inptr[0], 4);
-    inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
-    // Reconstruction and Store
-    {
-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-      d0 = _mm_unpacklo_epi64(
-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-      d2 = _mm_unpacklo_epi64(
-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
-      // store input0
-      _mm_storel_epi64((__m128i *)dest, d0);
-      // store input1
-      d0 = _mm_srli_si128(d0, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride), d0);
-      // store input2
-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-      // store input3
-      d2 = _mm_srli_si128(d2, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[4], temp_out[4];
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j * 4 + i];
-      vp9_highbd_idct4(temp_in, temp_out, bd);
-      for (j = 0; j < 4; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-      }
-    }
-  }
-}
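The 4x4 high-bit-depth routine above, like the 8x8 and 16x16 variants that follow, packs the 32-bit coefficients to 16 bits and takes the fast SSE2 path only when every value stays inside a fixed bound (±12043 here, ±6201 and ±3155 below); otherwise it falls back to the bit-exact C transform for that pass. The range test reduces to the following scalar check (fits_fast_path is a hypothetical name):

#include <stdint.h>

// Non-zero when every coefficient fits the 16-bit fast path; the SSE2 code
// above performs the same test with packed min/max and _mm_movemask_epi8.
static int fits_fast_path(const int32_t *coeffs, int n, int bound) {
  int i;
  for (i = 0; i < n; ++i)
    if (coeffs[i] > bound || coeffs[i] < -bound) return 0;
  return 1;
}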
-
-void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 8; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_8x8(inptr, inptr);
-      for (i = 0; i < 8; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 8; ++i) {
-      vp9_highbd_idct8(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final rounding & shift, reconstruction and store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j * 8 + i];
-      vp9_highbd_idct8(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Only the first 4 rows have non-zero coeffs
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
-      array_transpose_4X8(inptr, inptr);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vp9_highbd_idct8(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final rounding & shift, reconstruction and store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j * 8 + i];
-      vp9_highbd_idct8(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                       int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 32; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 32; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_16x16(inptr, inptr + 16);
-      for (i = 0; i < 16; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 16; ++i) {
-      vp9_highbd_idct16(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding & shift, reconstruction and store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
-        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
-        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
-        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j * 16 + i];
-      vp9_highbd_idct16(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                      int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
-  // we only need to consider the first 4 rows here.
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform (N.B. This transposes inptr)
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 16; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
-      array_transpose_8x8(inptr, inptr);
-      array_transpose_8x8(inptr + 8, inptr + 16);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vp9_highbd_idct16(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding & shift, reconstruction and store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
-        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
-        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
-        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j * 16 + i];
-      vp9_highbd_idct16(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-#endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ /dev/null
@@ -1,174 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
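array_transpose_8x8 does its work in three rounds of unpacklo/unpackhi at 16-, 32- and 64-bit granularity; its net effect is simply an 8x8 transpose of int16 values, as in this reference sketch:

#include <stdint.h>

// Reference 8x8 transpose of 16-bit values, matching what the SSE2 unpack
// sequence above computes.
static void transpose_8x8_c(const int16_t in[8][8], int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}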
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
-  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
-  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
-  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
-  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
-  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
-  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
-  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
-  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
-  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
-  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
-  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
-  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
-  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
-  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
-  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
-  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
-}
-
-#define RECON_AND_STORE(dest, in_x) \
-  {                                                     \
-     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-      d0 = _mm_unpacklo_epi8(d0, zero); \
-      d0 = _mm_add_epi16(in_x, d0); \
-      d0 = _mm_packus_epi16(d0, d0); \
-      _mm_storel_epi64((__m128i *)(dest), d0); \
-  }
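RECON_AND_STORE loads eight destination pixels, widens them to 16 bits, adds the residual, and repacks with unsigned saturation before storing. The scalar counterpart is just a clipped add per pixel (a sketch; clip_pixel is defined locally here for self-containment):

#include <stdint.h>

static uint8_t clip_pixel(int val) { return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val)); }

// Scalar counterpart of RECON_AND_STORE for one row of 8 pixels.
static void recon_and_store_c(uint8_t *dest, const int16_t *in) {
  int x;
  for (x = 0; x < 8; ++x) dest[x] = clip_pixel(dest[x] + in[x]);
}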
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1<<5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
-
-  RECON_AND_STORE(dest +  0 * stride, in[0]);
-  RECON_AND_STORE(dest +  1 * stride, in[1]);
-  RECON_AND_STORE(dest +  2 * stride, in[2]);
-  RECON_AND_STORE(dest +  3 * stride, in[3]);
-  RECON_AND_STORE(dest +  4 * stride, in[4]);
-  RECON_AND_STORE(dest +  5 * stride, in[5]);
-  RECON_AND_STORE(dest +  6 * stride, in[6]);
-  RECON_AND_STORE(dest +  7 * stride, in[7]);
-  RECON_AND_STORE(dest +  8 * stride, in[8]);
-  RECON_AND_STORE(dest +  9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
--- a/vp9/common/x86/vp9_idct_sse2.asm
+++ /dev/null
@@ -1,102 +1,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro REORDER_INPUTS 0
-  ; a c d b  to  a b c d
-  SWAP 1, 3, 2
-%endmacro
-
-%macro TRANSFORM_COLS 0
-  ; input:
-  ; m0 a
-  ; m1 b
-  ; m2 c
-  ; m3 d
-  paddw           m0,        m2
-  psubw           m3,        m1
-
-  ; wide subtract
-  punpcklwd       m4,        m0
-  punpcklwd       m5,        m3
-  psrad           m4,        16
-  psrad           m5,        16
-  psubd           m4,        m5
-  psrad           m4,        1
-  packssdw        m4,        m4             ; e
-
-  psubw           m5,        m4,        m1  ; b
-  psubw           m4,        m2             ; c
-  psubw           m0,        m5
-  paddw           m3,        m4
-                                ; m0 a
-  SWAP            1,         5  ; m1 b
-  SWAP            2,         4  ; m2 c
-                                ; m3 d
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  punpcklwd       m0,        m2
-  punpcklwd       m1,        m3
-  mova            m2,        m0
-  punpcklwd       m0,        m1
-  punpckhwd       m2,        m1
-  pshufd          m1,        m0, 0x0e
-  pshufd          m3,        m2, 0x0e
-%endmacro
-
-; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
-%macro TRANSPOSE_4X4_WIDE 0
-  mova            m3, m0
-  punpcklwd       m0, m1
-  punpckhwd       m3, m1
-  mova            m2, m0
-  punpcklwd       m0, m3
-  punpckhwd       m2, m3
-  pshufd          m1, m0, 0x0e
-  pshufd          m3, m2, 0x0e
-%endmacro
-
-%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
-  movq            m%3,       [outputq]
-  movq            m%4,       [outputq + strideq]
-  punpcklbw       m%3,       m%5
-  punpcklbw       m%4,       m%5
-  paddw           m%1,       m%3
-  paddw           m%2,       m%4
-  packuswb        m%1,       m%5
-  packuswb        m%2,       m%5
-  movd            [outputq], m%1
-  movd            [outputq + strideq], m%2
-%endmacro
-
-INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-
-  psraw           m0,        2
-  psraw           m1,        2
-
-  TRANSPOSE_4X4_WIDE
-  REORDER_INPUTS
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  REORDER_INPUTS
-  TRANSFORM_COLS
-
-  pxor            m4, m4
-  ADD_STORE_4P_2X  0, 1, 5, 6, 4
-  lea             outputq, [outputq + 2 * strideq]
-  ADD_STORE_4P_2X  2, 3, 5, 6, 4
-
-  RET
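TRANSFORM_COLS above is one pass of the 4-point inverse Walsh-Hadamard transform used by iwht4x4_16_add; its a/b/c/d register comments map directly onto a short scalar recurrence. A sketch of that column step in C, derived from the macro's operation order:

// One inverse WHT column pass over values a, b, c, d, following the
// arithmetic of TRANSFORM_COLS above.
static void iwht4_col_step(int *a, int *b, int *c, int *d) {
  int e;
  *a += *c;
  *d -= *b;
  e = (*a - *d) >> 1;
  *b = e - *b;
  *c = e - *c;
  *a -= *b;
  *d += *c;
}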
--- a/vp9/common/x86/vp9_idct_ssse3_x86_64.asm
+++ /dev/null
@@ -1,300 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides the SSSE3 version of the inverse transforms. Some of
-; the functions were originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-pw_16:      times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
-%endmacro
-
-TRANSFORM_COEFFS    6270, 15137
-TRANSFORM_COEFFS    3196, 16069
-TRANSFORM_COEFFS   13623,  9102
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS     30274, 12540
-PAIR_PP_COEFFS      6392, 32138
-PAIR_MP_COEFFS     18204, 27246
-
-PAIR_PP_COEFFS     12540, 12540
-PAIR_PP_COEFFS     30274, 30274
-PAIR_PP_COEFFS      6392,  6392
-PAIR_PP_COEFFS     32138, 32138
-PAIR_MM_COEFFS     18204, 18204
-PAIR_PP_COEFFS     27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
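BUTTERFLY_4X is the rotation butterfly used throughout these transforms: pmaddwd multiplies interleaved (b, a) word pairs against the pw_%1_%2 / pw_m%2_%1 coefficient tables emitted by TRANSFORM_COEFFS, then rounds with pd_8192 and shifts right by 14. A scalar sketch of that arithmetic (butterfly_c is a hypothetical name):

#include <stdint.h>

#define DCT_CONST_BITS 14

// Scalar form of the BUTTERFLY_4X rotation with 14-bit rounding; 8192 is
// 1 << (DCT_CONST_BITS - 1), i.e. the pd_8192 constant above.
static void butterfly_c(int a, int b, int c1, int c2,
                        int16_t *out1, int16_t *out2) {
  const int round = 1 << (DCT_CONST_BITS - 1);
  *out1 = (int16_t)((a * c1 - b * c2 + round) >> DCT_CONST_BITS);
  *out2 = (int16_t)((b * c1 + a * c2 + round) >> DCT_CONST_BITS);
}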
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro IDCT8_1D 0
-  SUM_SUB          0,    4,    9
-  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
-  pmulhrsw        m0,  m12
-  pmulhrsw        m4,  m12
-  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
-  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
-
-  SUM_SUB          1,    5,    9
-  SUM_SUB          7,    3,    9
-  SUM_SUB          0,    6,    9
-  SUM_SUB          4,    2,    9
-  SUM_SUB          3,    5,    9
-  pmulhrsw        m3,  m12
-  pmulhrsw        m5,  m12
-
-  SUM_SUB          0,    7,    9
-  SUM_SUB          4,    3,    9
-  SUM_SUB          2,    5,    9
-  SUM_SUB          6,    1,    9
-
-  SWAP             3,    6
-  SWAP             1,    4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
-  paddw           m%1, m11
-  paddw           m%2, m11
-  psraw           m%1, 5
-  psraw           m%2, 5
-
-  movh            m%3, [outputq]
-  movh            m%4, [outputq + strideq]
-  punpcklbw       m%3, m%5
-  punpcklbw       m%4, m%5
-  paddw           m%3, m%1
-  paddw           m%4, m%2
-  packuswb        m%3, m%5
-  packuswb        m%4, m%5
-  movh               [outputq], m%3
-  movh     [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
-  mova     m8, [pd_8192]
-  mova    m11, [pw_16]
-  mova    m12, [pw_11585x2]
-
-  lea      r3, [2 * strideq]
-
-  mova     m0, [inputq +   0]
-  mova     m1, [inputq +  16]
-  mova     m2, [inputq +  32]
-  mova     m3, [inputq +  48]
-  mova     m4, [inputq +  64]
-  mova     m5, [inputq +  80]
-  mova     m6, [inputq +  96]
-  mova     m7, [inputq + 112]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
-  mova       m8, [pd_8192]
-  mova      m11, [pw_16]
-  mova      m12, [pw_11585x2]
-
-  lea        r3, [2 * strideq]
-
-  mova       m0, [inputq +  0]
-  mova       m1, [inputq + 16]
-  mova       m2, [inputq + 32]
-  mova       m3, [inputq + 48]
-
-  punpcklwd  m0, m1
-  punpcklwd  m2, m3
-  punpckhdq  m9, m0, m2
-  punpckldq  m0, m2
-  SWAP       2, 9
-
-  ; m0 -> [0], [0]
-  ; m1 -> [1], [1]
-  ; m2 -> [2], [2]
-  ; m3 -> [3], [3]
-  punpckhqdq m10, m0, m0
-  punpcklqdq m0,  m0
-  punpckhqdq m9,  m2, m2
-  punpcklqdq m2,  m2
-  SWAP       1, 10
-  SWAP       3,  9
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m2, [dpw_30274_12540]
-  pmulhrsw   m1, [dpw_6392_32138]
-  pmulhrsw   m3, [dpw_m18204_27246]
-
-  SUM_SUB    0, 2, 9
-  SUM_SUB    1, 3, 9
-
-  punpcklqdq m9, m3, m3
-  punpckhqdq m5, m3, m9
-
-  SUM_SUB    3, 5, 9
-  punpckhqdq m5, m3
-  pmulhrsw   m5, m12
-
-  punpckhqdq m9, m1, m5
-  punpcklqdq m1, m5
-  SWAP       5, 9
-
-  SUM_SUB    0, 5, 9
-  SUM_SUB    2, 1, 9
-
-  punpckhqdq m3, m0, m0
-  punpckhqdq m4, m1, m1
-  punpckhqdq m6, m5, m5
-  punpckhqdq m7, m2, m2
-
-  punpcklwd  m0, m3
-  punpcklwd  m7, m2
-  punpcklwd  m1, m4
-  punpcklwd  m6, m5
-
-  punpckhdq  m4, m0, m7
-  punpckldq  m0, m7
-  punpckhdq  m10, m1, m6
-  punpckldq  m5, m1, m6
-
-  punpckhqdq m1, m0, m5
-  punpcklqdq m0, m5
-  punpckhqdq m3, m4, m10
-  punpcklqdq m2, m4, m10
-
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m6, m2, [dpw_30274_30274]
-  pmulhrsw   m4, m2, [dpw_12540_12540]
-
-  pmulhrsw   m7, m1, [dpw_32138_32138]
-  pmulhrsw   m1, [dpw_6392_6392]
-  pmulhrsw   m5, m3, [dpw_m18204_m18204]
-  pmulhrsw   m3, [dpw_27246_27246]
-
-  mova       m2, m0
-  SUM_SUB    0, 6, 9
-  SUM_SUB    2, 4, 9
-  SUM_SUB    1, 5, 9
-  SUM_SUB    7, 3, 9
-
-  SUM_SUB    3, 5, 9
-  pmulhrsw   m3, m12
-  pmulhrsw   m5, m12
-
-  SUM_SUB    0, 7, 9
-  SUM_SUB    2, 3, 9
-  SUM_SUB    4, 5, 9
-  SUM_SUB    6, 1, 9
-
-  SWAP       3, 6
-  SWAP       1, 2
-  SWAP       2, 4
-
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-%endif
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -17,7 +17,7 @@
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
 void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -66,7 +66,6 @@
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
@@ -96,14 +95,7 @@
 endif
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
 
-ifeq ($(ARCH_X86_64), yes)
-ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
-endif
-endif
-
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -110,31 +102,5 @@
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
 endif
-
-# neon with assembly and intrinsics implementations. If both are available
-# prefer assembly.
-ifeq ($(HAVE_NEON_ASM), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-else
-ifeq ($(HAVE_NEON), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
--- /dev/null
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -1,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_idct16x16_1_add_neon|
+
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -1,0 +1,61 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vp9_idct16x16_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint64x1_t d2u64, d3u64, d4u64, d5u64;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, j, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
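+    // A DC-only block inverse-transforms to a constant: a1 is input[0] taken
+    // through both 1-D passes and the final >> 6, and is simply added to every
+    // pixel of the 16x16 destination block below.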
+
+    q0s16 = vdupq_n_s16(a1);
+    q0u16 = vreinterpretq_u16_s16(q0s16);
+
+    for (d1 = d2 = dest, i = 0; i < 4; i++) {
+        for (j = 0; j < 2; j++) {
+            d2u64 = vld1_u64((const uint64_t *)d1);
+            d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+            d4u64 = vld1_u64((const uint64_t *)d1);
+            d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+
+            q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+            q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+            q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+            q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+            d2 += dest_stride;
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+            d2 += dest_stride;
+        }
+    }
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -1,0 +1,1179 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_idct16x16_256_add_neon_pass1|
+    EXPORT  |vp9_idct16x16_256_add_neon_pass2|
+    EXPORT  |vp9_idct16x16_10_add_neon_pass1|
+    EXPORT  |vp9_idct16x16_10_add_neon_pass2|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
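+    ; The vswp pairs effectively exchange the off-diagonal 4x4 half-blocks;
+    ; vtrn.32 and vtrn.16 then complete the transpose within each 4x4 block.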
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
+;                                          int16_t *output, int output_stride)
+;
+; r0  int16_t input
+; r1  int16_t *output
+; r2  int  output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_256_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!
+    vmov.s16        q15, q1
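+    ; Each vld2 deinterleaves one 16-element row: the even-indexed halfwords
+    ; stay in the first register of the pair while the odd half is overwritten
+    ; by the next load. The last row goes through q1/q2 and is copied into q15
+    ; because a vld2 register list must be consecutive and q15 has no successor.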
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0xc00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r12, #0x3e00
+    add             r12, #0xc5
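+    ; (a classic ARM data-processing immediate is an 8-bit value with rotation,
+    ;  so each 16-bit cosine constant is built with a mov/add pair)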
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r12                   ; duplicate cospi_4_64
+
+    ; preloading to avoid stall
+    ; generate cospi_12_64 = 13623
+    mov             r3, #0x3500
+    add             r3, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r12, #0x2300
+    add             r12, #0x8e
+
+    ; step2[4] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; step2[4] * cospi_4_64
+    vmull.s16       q5, d18, d1
+    vmull.s16       q6, d19, d1
+
+    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+    vmlal.s16       q5, d30, d0
+    vmlal.s16       q6, d31, d0
+
+    vdup.16         d2, r3                    ; duplicate cospi_12_64
+    vdup.16         d3, r12                   ; duplicate cospi_20_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q5, #14              ; >> 14
+    vqrshrn.s32     d15, q6, #14              ; >> 14
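+    ; (vqrshrn adds the rounding bias, shifts right by DCT_CONST_BITS = 14 and
+    ;  narrows with saturation to int16, i.e. dct_const_round_shift())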
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; step2[5] * cospi_12_64
+    vmull.s16       q2, d26, d2
+    vmull.s16       q3, d27, d2
+
+    ; step2[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q15, d27, d3
+
+    ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
+    vmlsl.s16       q2, d22, d3
+    vmlsl.s16       q3, d23, d3
+
+    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q15, d23, d2
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q2, #14              ; >> 14
+    vqrshrn.s32     d11, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q15, #14             ; >> 14
+
+    ; stage 4
+    vdup.16         d30, r3                   ; cospi_16_64
+
+    ; step1[0] * cospi_16_64
+    vmull.s16       q2, d16, d30
+    vmull.s16       q11, d17, d30
+
+    ; step1[1] * cospi_16_64
+    vmull.s16       q0, d24, d30
+    vmull.s16       q1, d25, d30
+
+    ; generate cospi_8_64 = 15137
+    mov             r3, #0x3b00
+    add             r3, #0x21
+
+    vdup.16         d30, r12                  ; duplicate cospi_24_64
+    vdup.16         d31, r3                   ; duplicate cospi_8_64
+
+    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+    vadd.s32        q3, q2, q0
+    vadd.s32        q12, q11, q1
+
+    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+    vsub.s32        q13, q2, q0
+    vsub.s32        q1, q11, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d16, q3, #14              ; >> 14
+    vqrshrn.s32     d17, q12, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d18, q13, #14             ; >> 14
+    vqrshrn.s32     d19, q1, #14              ; >> 14
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    ; step1[2] * cospi_8_64
+    vmull.s16       q0, d20, d31
+    vmull.s16       q1, d21, d31
+
+    ; step1[2] * cospi_24_64
+    vmull.s16       q12, d20, d30
+    vmull.s16       q13, d21, d30
+
+    ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
+    vmlal.s16       q0, d28, d30
+    vmlal.s16       q1, d29, d30
+
+    ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
+    vmlsl.s16       q12, d28, d31
+    vmlsl.s16       q13, d29, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d22, q0, #14              ; >> 14
+    vqrshrn.s32     d23, q1, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d20, q12, #14             ; >> 14
+    vqrshrn.s32     d21, q13, #14             ; >> 14
+
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
+    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; stage 5
+    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
+    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
+    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
+    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
+
+    vdup.16         d16, r3;                  ; duplicate cospi_16_64
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q11, d26, d16
+    vmull.s16       q12, d27, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q6, q9, q11
+    vsub.s32        q13, q10, q12
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+    vqrshrn.s32     d11, q13, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
+
+    ; store the data
+    vst1.64         {d16}, [r1], r2
+    vst1.64         {d17}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vp9_idct16x16_256_add_neon_pass1|
+
+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
+;                                        int16_t *output,
+;                                        int16_t *pass1Output,
+;                                        int16_t skip_adding,
+;                                        uint8_t *dest,
+;                                        int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
+; r4  uint8_t *dest,
+; r5  int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_256_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate  cospi_30_64 = 1606
+    mov             r3, #0x0600
+    add             r3, #0x46
+
+    ; generate cospi_2_64  = 16305
+    mov             r12, #0x3f00
+    add             r12, #0xb1
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d12, r3                   ; duplicate cospi_30_64
+    vdup.16         d13, r12                  ; duplicate cospi_2_64
+
+    ; preloading to avoid stall
+    ; generate cospi_14_64 = 12665
+    mov             r3, #0x3100
+    add             r3, #0x79
+
+    ; generate cospi_18_64 = 10394
+    mov             r12, #0x2800
+    add             r12, #0x9a
+
+    ; step1[8] * cospi_30_64
+    vmull.s16       q2, d16, d12
+    vmull.s16       q3, d17, d12
+
+    ; step1[8] * cospi_2_64
+    vmull.s16       q1, d16, d13
+    vmull.s16       q4, d17, d13
+
+    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+    vmlsl.s16       q2, d30, d13
+    vmlsl.s16       q3, d31, d13
+
+    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+    vmlal.s16       q1, d30, d12
+    vmlal.s16       q4, d31, d12
+
+    vdup.16         d30, r3                   ; duplicate cospi_14_64
+    vdup.16         d31, r12                  ; duplicate cospi_18_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d0, q2, #14               ; >> 14
+    vqrshrn.s32     d1, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q1, #14              ; >> 14
+    vqrshrn.s32     d15, q4, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_10_64 = 14449
+    mov             r12, #0x3800
+    add             r12, #0x71
+
+    ; step1[9] * cospi_14_64
+    vmull.s16       q2, d24, d30
+    vmull.s16       q3, d25, d30
+
+    ; step1[9] * cospi_18_64
+    vmull.s16       q4, d24, d31
+    vmull.s16       q5, d25, d31
+
+    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+    vmlsl.s16       q2, d22, d31
+    vmlsl.s16       q3, d23, d31
+
+    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+    vmlal.s16       q4, d22, d30
+    vmlal.s16       q5, d23, d30
+
+    vdup.16         d30, r3                   ; duplicate cospi_22_64
+    vdup.16         d31, r12                  ; duplicate cospi_10_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q2, #14               ; >> 14
+    vqrshrn.s32     d3, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q4, #14              ; >> 14
+    vqrshrn.s32     d13, q5, #14              ; >> 14
+
+    ; step1[10] * cospi_22_64
+    vmull.s16       q11, d20, d30
+    vmull.s16       q12, d21, d30
+
+    ; step1[10] * cospi_10_64
+    vmull.s16       q4, d20, d31
+    vmull.s16       q5, d21, d31
+
+    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+    vmlsl.s16       q11, d26, d31
+    vmlsl.s16       q12, d27, d31
+
+    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+    vmlal.s16       q4, d26, d30
+    vmlal.s16       q5, d27, d30
+
+    ; preloading to avoid stall
+    ; generate cospi_6_64 = 15679
+    mov             r3, #0x3d00
+    add             r3, #0x3f
+
+    ; generate cospi_26_64 = 4756
+    mov             r12, #0x1200
+    add             r12, #0x94
+
+    vdup.16         d30, r3                   ; duplicate cospi_6_64
+    vdup.16         d31, r12                  ; duplicate cospi_26_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d11, q5, #14              ; >> 14
+    vqrshrn.s32     d10, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_6_64
+    vmull.s16       q10, d28, d30
+    vmull.s16       q11, d29, d30
+
+    ; step1[11] * cospi_26_64
+    vmull.s16       q12, d28, d31
+    vmull.s16       q13, d29, d31
+
+    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+    vmlsl.s16       q10, d18, d31
+    vmlsl.s16       q11, d19, d31
+
+    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
+    vmlal.s16       q12, d18, d30
+    vmlal.s16       q13, d19, d30
+
+    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
+    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q11, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q12, #14              ; >> 14
+    vqrshrn.s32     d9, q13, #14              ; >> 14
+
+    ; stage 3
+    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
+    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
+    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
+    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
+    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+step2[15]
+    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+
+    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d18, d31
+    vmull.s16       q3, d19, d31
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q4, d28, d31
+    vmull.s16       q5, d29, d31
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d28, d30
+    vmlal.s16       q3, d29, d30
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q4, d18, d30
+    vmlsl.s16       q5, d19, d30
+
+    rsb             r12, #0
+    vdup.16         d30, r12                  ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q4, #14               ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    vmov.s16        q3, q11
+    vmov.s16        q4, q12
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q11, d26, d30
+    vmull.s16       q12, d27, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d20, d30
+    vmull.s16       q9, d21, d30
+
+    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    vmlsl.s16       q11, d20, d31
+    vmlsl.s16       q12, d21, d31
+
+    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d26, d31
+    vmlal.s16       q9, d27, d31
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q10, q3, q0
+    vadd.s32        q4, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q10, #14             ; >> 14
+    vqrshrn.s32     d11, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
+    cmp              r3, #0                   ; check whether to add to dest data
+    beq              skip_adding_dest
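+    ; skip_adding == 0: branch below and store the 16-bit result to output;
+    ; otherwise fall through, round (>> 6), add to dest and clip before storing.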
+
+    ldr              r7, [sp, #28]            ; dest used to save element 0-7
+    mov              r9, r7                   ; save dest pointer for later use
+    ldr              r8, [sp, #32]            ; load dest_stride
+
+    ; stage 7
+    ; load the data in pass1
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
+    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q8                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q9, q9, #6
+    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q9                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vrshr.s16       q2, q2, #6
+    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q2                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q3, q3, #6
+    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q3                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vrshr.s16       q4, q4, #6
+    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q4                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q5, q5, #6
+    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q5                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vrshr.s16       q14, q14, #6
+    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q14                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q15, q15, #6
+    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q15                  ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    b               end_idct16x16_pass2
+
+skip_adding_dest
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24
+    mov              r3, #8
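+    ; each 8-sample row (one q register, stored as two d halves) is written
+    ; contiguously with an 8-byte advance; the final 24-byte step then skips
+    ; to the start of the next 16-sample-wide output row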
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vp9_idct16x16_256_add_neon_pass2|
+
+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
+;                                             int16_t *output, int output_stride)
+;
+; r0  int16_t input
+; r1  int16_t *output
+; r2  int  output_stride)
+
+; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_10_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!
+    vmov.s16        q15, q1
+
+    ; generate  cospi_28_64*2 = 6392
+    mov             r3, #0x1800
+    add             r3, #0xf8
+
+    ; generate cospi_4_64*2  = 32138
+    mov             r12, #0x7d00
+    add             r12, #0x8a
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
+    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
+    ; the constant will change this to >> 14.
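+    ; For example, with cospi_28_64 = 3196 the doubled constant is 6392, and
+    ; vqrdmulh(x, 6392) = (2 * x * 6392 + (1 << 15)) >> 16
+    ;                   = (x * 3196 + (1 << 13)) >> 14,
+    ; which matches dct_const_round_shift(x * cospi_28_64).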
+    ; dct_const_round_shift(step2[4] * cospi_28_64);
+    vqrdmulh.s16    q4, q9, q0
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64*2 = 23170
+    mov             r3, #0x5a00
+    add             r3, #0x82
+
+    ; dct_const_round_shift(step2[4] * cospi_4_64);
+    vqrdmulh.s16    q7, q9, q1
+
+    ; stage 4
+    vdup.16         q1, r3                    ; cospi_16_64*2
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    vdup.16         d4, r3;                   ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(step1[0] * cospi_16_64)
+    vqrdmulh.s16    q8, q8, q1
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d14, d4
+    vmull.s16       q10, d15, d4
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q12, d9, d4
+    vmull.s16       q11, d8, d4
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q15, q10, q12
+    vsub.s32        q6, q9, q11
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d11, q15, #14             ; >> 14
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
+    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
+    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
+
+    ; store the data
+    vst1.64         {d4}, [r1], r2
+    vst1.64         {d5}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vp9_idct16x16_10_add_neon_pass1|
+
+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
+;                                           int16_t *output,
+;                                           int16_t *pass1Output,
+;                                           int16_t skip_adding,
+;                                           uint8_t *dest,
+;                                           int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding,
+; r4  uint8_t *dest,
+; r5  int dest_stride)
+
+; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
+; will be stored back into q8-q15 registers. This function will touch q0-q7
+; registers and use them as buffer during calculation.
+|vp9_idct16x16_10_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate 2*cospi_30_64 = 3212
+    mov             r3, #0xc00
+    add             r3, #0x8c
+
+    ; generate 2*cospi_2_64  = 32610
+    mov             r12, #0x7f00
+    add             r12, #0x62
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
+
+    ; dct_const_round_shift(step1[8] * cospi_30_64)
+    vqrdmulh.s16    q0, q8, q6
+
+    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
+
+    ; dct_const_round_shift(step1[8] * cospi_2_64)
+    vqrdmulh.s16    q7, q8, q6
+
+    ; preloading to avoid stall
+    ; generate 2*cospi_26_64 = 9512
+    mov             r12, #0x2500
+    add             r12, #0x28
+    rsb             r12, #0
+    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
+
+    ; generate 2*cospi_6_64 = 31358
+    mov             r3, #0x7a00
+    add             r3, #0x7e
+    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
+
+    ; dct_const_round_shift(- step1[12] * cospi_26_64)
+    vqrdmulh.s16    q3, q9, q15
+
+    ; dct_const_round_shift(step1[12] * cospi_6_64)
+    vqrdmulh.s16    q4, q9, q14
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q12, d14, d31
+    vmull.s16       q5, d15, d31
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d0, d31
+    vmull.s16       q11, d1, d31
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q12, d0, d30
+    vmlsl.s16       q5, d1, d30
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d14, d30
+    vmlal.s16       q11, d15, d30
+
+    rsb              r12, #0
+    vdup.16          d30, r12                 ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q12, #14              ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q11, #14             ; >> 14
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q10, d8, d30
+    vmull.s16       q13, d9, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d6, d30
+    vmull.s16       q9, d7, d30
+
+    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    vmlsl.s16       q10, d6, d31
+    vmlsl.s16       q13, d7, d31
+
+    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d8, d31
+    vmlal.s16       q9, d9, d31
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q10, #14              ; >> 14
+    vqrshrn.s32     d5, q13, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q0, q3, q0
+    vadd.s32        q1, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q0, #14              ; >> 14
+    vqrshrn.s32     d11, q1, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
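+    ; (skip_adding is loaded but never tested here: this pass always stores
+    ;  the 16-bit result to output and has no add-to-dest path)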
+
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24
+    mov              r3, #8
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct10_16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vp9_idct16x16_10_add_neon_pass2|
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -1,0 +1,1317 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+void vp9_idct16x16_256_add_neon_pass1(
+        int16_t *in,
+        int16_t *out,
+        int output_stride) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);
+    q8s16  = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16  = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
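+    // vld2q_s16 deinterleaves each 16-wide row; keeping only .val[0] gathers
+    // the even-indexed elements, mirroring the vld2.s16 loads in the assembly
+    // version.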
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d18s16, d1s16);
+    q6s32 = vmull_s16(d19s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d8s16 = vqrshrn_n_s32(q2s32, 14);
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q5s32, 14);
+    d15s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    q2s32 = vmull_s16(d26s16, d2s16);
+    q3s32 = vmull_s16(d27s16, d2s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q15s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+    d10s16 = vqrshrn_n_s32(q2s32, 14);
+    d11s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q15s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d30s16);
+    q11s32 = vmull_s16(d17s16, d30s16);
+    q0s32 = vmull_s16(d24s16, d30s16);
+    q1s32 = vmull_s16(d25s16, d30s16);
+
+    d30s16 = vdup_n_s16(cospi_24_64);
+    d31s16 = vdup_n_s16(cospi_8_64);
+
+    q3s32 = vaddq_s32(q2s32, q0s32);
+    q12s32 = vaddq_s32(q11s32, q1s32);
+    q13s32 = vsubq_s32(q2s32, q0s32);
+    q1s32 = vsubq_s32(q11s32, q1s32);
+
+    d16s16 = vqrshrn_n_s32(q3s32, 14);
+    d17s16 = vqrshrn_n_s32(q12s32, 14);
+    d18s16 = vqrshrn_n_s32(q13s32, 14);
+    d19s16 = vqrshrn_n_s32(q1s32, 14);
+    q8s16 = vcombine_s16(d16s16, d17s16);
+    q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q0s32 = vmull_s16(d20s16, d31s16);
+    q1s32 = vmull_s16(d21s16, d31s16);
+    q12s32 = vmull_s16(d20s16, d30s16);
+    q13s32 = vmull_s16(d21s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+    d22s16 = vqrshrn_n_s32(q0s32, 14);
+    d23s16 = vqrshrn_n_s32(q1s32, 14);
+    d20s16 = vqrshrn_n_s32(q12s32, 14);
+    d21s16 = vqrshrn_n_s32(q13s32, 14);
+    q10s16 = vcombine_s16(d20s16, d21s16);
+    q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q15s16 = vaddq_s16(q6s16, q7s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    // stage 5
+    q0s16 = vaddq_s16(q8s16, q11s16);
+    q1s16 = vaddq_s16(q9s16, q10s16);
+    q2s16 = vsubq_s16(q9s16, q10s16);
+    q3s16 = vsubq_s16(q8s16, q11s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q11s32 = vmull_s16(d26s16, d16s16);
+    q12s32 = vmull_s16(d27s16, d16s16);
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+
+    q6s32 = vsubq_s32(q9s32, q11s32);
+    q13s32 = vsubq_s32(q10s32, q12s32);
+    q9s32 = vaddq_s32(q9s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q12s32);
+
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d11s16 = vqrshrn_n_s32(q13s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 6
+    q8s16 = vaddq_s16(q0s16, q15s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d16u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
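For reference, each vmull/vmlsl/vmlal pair followed by vqrshrn_n_s32(x, 14) in the pass above implements the fixed-point rotation used throughout the transform, with DCT_CONST_BITS == 14. A minimal scalar sketch (helper names are illustrative; the narrowing saturation that vqrshrn also applies is omitted):

#include <stdint.h>

/* a * c0 - b * c1 and a * c1 + b * c0, each rounded back to 16 bits the same
 * way dct_const_round_shift() does: add 1 << 13, then shift right by 14. */
static int16_t rotate_lo(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  int32_t t = (int32_t)a * c0 - (int32_t)b * c1;
  return (int16_t)((t + (1 << 13)) >> 14);
}

static int16_t rotate_hi(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  int32_t t = (int32_t)a * c1 + (int32_t)b * c0;
  return (int16_t)((t + (1 << 13)) >> 14);
}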
+
+void vp9_idct16x16_256_add_neon_pass2(
+        int16_t *src,
+        int16_t *out,
+        int16_t *pass1Output,
+        int16_t skip_adding,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d;
+    uint8x8_t d12u8, d13u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64;
+    int64x1_t d12s64, d13s64;
+    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(src);
+    q8s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d12s16 = vdup_n_s16(cospi_30_64);
+    d13s16 = vdup_n_s16(cospi_2_64);
+
+    q2s32 = vmull_s16(d16s16, d12s16);
+    q3s32 = vmull_s16(d17s16, d12s16);
+    q1s32 = vmull_s16(d16s16, d13s16);
+    q4s32 = vmull_s16(d17s16, d13s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+    d0s16 = vqrshrn_n_s32(q2s32, 14);
+    d1s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q1s32, 14);
+    d15s16 = vqrshrn_n_s32(q4s32, 14);
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_14_64);
+    d31s16 = vdup_n_s16(cospi_18_64);
+
+    q2s32 = vmull_s16(d24s16, d30s16);
+    q3s32 = vmull_s16(d25s16, d30s16);
+    q4s32 = vmull_s16(d24s16, d31s16);
+    q5s32 = vmull_s16(d25s16, d31s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q2s32, 14);
+    d3s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q4s32, 14);
+    d13s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(cospi_22_64);
+    d31s16 = vdup_n_s16(cospi_10_64);
+
+    q11s32 = vmull_s16(d20s16, d30s16);
+    q12s32 = vmull_s16(d21s16, d30s16);
+    q4s32 = vmull_s16(d20s16, d31s16);
+    q5s32 = vmull_s16(d21s16, d31s16);
+
+    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d11s16 = vqrshrn_n_s32(q5s32, 14);
+    d10s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    d30s16 = vdup_n_s16(cospi_6_64);
+    d31s16 = vdup_n_s16(cospi_26_64);
+
+    q10s32 = vmull_s16(d28s16, d30s16);
+    q11s32 = vmull_s16(d29s16, d30s16);
+    q12s32 = vmull_s16(d28s16, d31s16);
+    q13s32 = vmull_s16(d29s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q11s32, 14);
+    d8s16 = vqrshrn_n_s32(q12s32, 14);
+    d9s16 = vqrshrn_n_s32(q13s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 3
+    q9s16  = vsubq_s16(q0s16, q1s16);
+    q0s16  = vaddq_s16(q0s16, q1s16);
+    q10s16 = vsubq_s16(q3s16, q2s16);
+    q11s16 = vaddq_s16(q2s16, q3s16);
+    q12s16 = vaddq_s16(q4s16, q5s16);
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16  = vaddq_s16(q6s16, q7s16);
+
+    // stage 4
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q2s32 = vmull_s16(d18s16, d31s16);
+    q3s32 = vmull_s16(d19s16, d31s16);
+    q4s32 = vmull_s16(d28s16, d31s16);
+    q5s32 = vmull_s16(d29s16, d31s16);
+
+    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q3s32, 14);
+    d2s16 = vqrshrn_n_s32(q4s32, 14);
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q3s16 = q11s16;
+    q4s16 = q12s16;
+
+    d30s16 = vdup_n_s16(-cospi_8_64);
+    q11s32 = vmull_s16(d26s16, d30s16);
+    q12s32 = vmull_s16(d27s16, d30s16);
+    q8s32 = vmull_s16(d20s16, d30s16);
+    q9s32 = vmull_s16(d21s16, d30s16);
+
+    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+
+    q3s32 = vmull_s16(d26s16, d14s16);
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q10s32 = vaddq_s32(q3s32, q0s32);
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q10s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
+    if (skip_adding != 0) {
+        d = dest;
+        // load the data in pass1
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        // store the data  out 8,9,10,11,12,13,14,15
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q8s16 = vrshrq_n_s16(q8s16, 6);
+        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q9s16 = vrshrq_n_s16(q9s16, 6);
+        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q2s16 = vrshrq_n_s16(q2s16, 6);
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q3s16 = vrshrq_n_s16(q3s16, 6);
+        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q4s16 = vrshrq_n_s16(q4s16, 6);
+        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q5s16 = vrshrq_n_s16(q5s16, 6);
+        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q14s16 = vrshrq_n_s16(q14s16, 6);
+        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        q15s16 = vrshrq_n_s16(q15s16, 6);
+        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    } else {  // skip_adding_dest
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+    }
+    return;
+}
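In the skip_adding != 0 branch above, each stage 7 row is rounded by 6 bits (vrshrq_n_s16), widened and added to the predictor pixels already in dest (vaddw_u8), then saturated back to 8 bits (vqmovun_s16). A per-pixel scalar sketch of that sequence (the helper name is illustrative):

#include <stdint.h>

static uint8_t add_residual_pixel(int16_t stage7_sum, uint8_t pred) {
  /* ROUND_POWER_OF_TWO(stage7_sum, 6), i.e. vrshrq_n_s16(x, 6) */
  int32_t v = pred + ((stage7_sum + (1 << 5)) >> 6);
  /* vqmovun_s16 saturates the signed sum to the [0, 255] pixel range */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}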
+
+void vp9_idct16x16_10_add_neon_pass1(
+        int16_t *in,
+        int16_t *out,
+        int output_stride) {
+    int16x4_t d4s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);
+    q8s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // stage 3
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+    // stage 4
+    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+    d4s16 = vdup_n_s16(cospi_16_64);
+
+    q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+    q9s32  = vmull_s16(d14s16, d4s16);
+    q10s32 = vmull_s16(d15s16, d4s16);
+    q12s32 = vmull_s16(d9s16, d4s16);
+    q11s32 = vmull_s16(d8s16, d4s16);
+
+    q15s32 = vsubq_s32(q10s32, q12s32);
+    q6s32 = vsubq_s32(q9s32, q11s32);
+    q9s32 = vaddq_s32(q9s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q12s32);
+
+    d11s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 6
+    q2s16 = vaddq_s16(q8s16, q7s16);
+    q9s16 = vaddq_s16(q8s16, q6s16);
+    q10s16 = vaddq_s16(q8s16, q5s16);
+    q11s16 = vaddq_s16(q8s16, q4s16);
+    q12s16 = vsubq_s16(q8s16, q4s16);
+    q13s16 = vsubq_s16(q8s16, q5s16);
+    q14s16 = vsubq_s16(q8s16, q6s16);
+    q15s16 = vsubq_s16(q8s16, q7s16);
+
+    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d4u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
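This pass replaces the multiply/round pairs with vqrdmulhq_s16 against doubled constants: VQRDMULH evaluates (2 * a * b + (1 << 15)) >> 16 with saturation, so passing b == 2 * cospi_N_64 yields (a * cospi_N_64 + (1 << 13)) >> 14, which is exactly dct_const_round_shift(a * cospi_N_64). A scalar sketch of that identity (saturation omitted):

#include <stdint.h>

static int16_t dct_mul_round(int16_t a, int16_t cospi) {
  /* What vqrdmulhq_s16(a, 2 * cospi) computes, ignoring saturation:
   * (2 * a * (2 * cospi) + (1 << 15)) >> 16 == (a * cospi + (1 << 13)) >> 14 */
  int64_t t = 2 * (int64_t)a * (2 * (int32_t)cospi);
  return (int16_t)((t + (1 << 15)) >> 16);
}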
+
+void vp9_idct16x16_10_add_neon_pass2(
+        int16_t *src,
+        int16_t *out,
+        int16_t *pass1Output,
+        int16_t skip_adding,
+        uint8_t *dest,
+        int dest_stride) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+    (void)skip_adding;
+    (void)dest;
+    (void)dest_stride;
+
+    q0x2s16 = vld2q_s16(src);
+    q8s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // stage 3
+    q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+    q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+    q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+    q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+    q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+    // stage 4
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d6s16 = vget_low_s16(q3s16);
+    d7s16 = vget_high_s16(q3s16);
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q12s32 = vmull_s16(d14s16, d31s16);
+    q5s32 = vmull_s16(d15s16, d31s16);
+    q2s32 = vmull_s16(d0s16, d31s16);
+    q11s32 = vmull_s16(d1s16, d31s16);
+
+    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q12s32, 14);
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q11s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(-cospi_8_64);
+    q10s32 = vmull_s16(d8s16, d30s16);
+    q13s32 = vmull_s16(d9s16, d30s16);
+    q8s32 = vmull_s16(d6s16, d30s16);
+    q9s32 = vmull_s16(d7s16, d30s16);
+
+    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q10s32, 14);
+    d5s16 = vqrshrn_n_s32(q13s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+    q3s32 = vmull_s16(d26s16, d14s16);
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q0s32 = vaddq_s32(q3s32, q0s32);
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q0s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q15s16);
+    q13s16 = vaddq_s16(q1s16, q14s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q14s16 = vsubq_s16(q1s16, q14s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q10s16, q5s16);
+    q13s16 = vaddq_s16(q11s16, q4s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q4s16 = vsubq_s16(q11s16, q4s16);
+    q5s16 = vsubq_s16(q10s16, q5s16);
+
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q3s16);
+    q13s16 = vaddq_s16(q1s16, q2s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q2s16 = vsubq_s16(q1s16, q2s16);
+    q3s16 = vsubq_s16(q0s16, q3s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    q12s16 = vaddq_s16(q10s16, q9s16);
+    q13s16 = vaddq_s16(q11s16, q8s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q8s16 = vsubq_s16(q11s16, q8s16);
+    q9s16 = vsubq_s16(q10s16, q9s16);
+
+    d4u64  = vreinterpret_u64_s16(vget_low_s16(q2s16));
+    d5u64  = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d6u64  = vreinterpret_u64_s16(vget_low_s16(q3s16));
+    d7u64  = vreinterpret_u64_s16(vget_high_s16(q3s16));
+    d8u64  = vreinterpret_u64_s16(vget_low_s16(q4s16));
+    d9u64  = vreinterpret_u64_s16(vget_high_s16(q4s16));
+    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    vst1_u64((uint64_t *)out, d16u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d4u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d6u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d7u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d8u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d9u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d10u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d11u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d31u64);
+    return;
+}
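The alternating out += 4 / out += 12 steps above write each 8-lane result row into eight consecutive columns of a 16-column row of the int16_t intermediate buffer, then skip ahead to the matching columns of the next row (4 + 12 == 16 values per row). A scalar sketch of one such store (store_half_row is an illustrative name):

#include <stdint.h>
#include <string.h>

static void store_half_row(int16_t *out /* start of the 8 target columns */,
                           const int16_t row8[8]) {
  memcpy(out, row8, 8 * sizeof(int16_t));  /* fill 8 of the row's 16 columns */
  /* the caller then advances out by 16 to reach the next 16-column row */
}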
--- /dev/null
+++ b/vpx_dsp/arm/idct16x16_neon.c
@@ -1,0 +1,185 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+                                      int16_t *output,
+                                      int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+                                      int16_t *output,
+                                      int16_t *pass1Output,
+                                      int16_t skip_adding,
+                                      uint8_t *dest,
+                                      int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+                                     int16_t *output,
+                                     int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+                                     int16_t *output,
+                                     int16_t *pass1Output,
+                                     int16_t skip_adding,
+                                     uint8_t *dest,
+                                     int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM
+
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];
+#endif
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vp9_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is saved into row_idct_output.
+  vp9_idct16x16_256_add_neon_pass2(input+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is saved into row_idct_output.
+  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     0,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is then added to the destination data.
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is then added to the destination data.
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vp9_pop_neon(store_reg);
+#endif
+
+  return;
+}
+
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];
+#endif
+  int16_t pass1_output[16*16] = {0};
+  int16_t row_idct_output[16*16] = {0};
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vp9_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is saved into row_idct_output.
+  vp9_idct16x16_10_add_neon_pass2(input+1,
+                                        row_idct_output,
+                                        pass1_output,
+                                        0,
+                                        dest,
+                                        dest_stride);
+
+  /* Skip the parallel idct on the lower 8 rows, as they are all 0s */
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is then added to the destination data.
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves
+  // the stage 6 result in pass1_output.
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15, combines
+  // them with the first-pass result (pass1_output) and calculates the final
+  // stage 7 result, which is then added to the destination data.
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,
+                                     dest+8,
+                                     dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vp9_pop_neon(store_reg);
+#endif
+
+  return;
+}
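These drivers are typically reached through the run-time CPU dispatch rather than called directly; a hedged usage sketch of invoking the 16x16 NEON path by hand (buffer names and contents are purely illustrative):

#include <stdint.h>

void vp9_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
                                int dest_stride);

static void example_16x16(void) {
  int16_t coeffs[16 * 16] = { 0 };  /* dequantized transform coefficients */
  uint8_t block[16 * 16] = { 0 };   /* prediction, reconstructed in place */
  coeffs[0] = 64;                   /* a DC-only block, for illustration  */
  vp9_idct16x16_256_add_neon(coeffs, block, 16 /* dest_stride in pixels */);
}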
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.asm
@@ -1,0 +1,144 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vp9_idct32x32_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ;TODO(hkuang): put the following macros in a separate
+    ;file so other idct functions could also use them.
+    MACRO
+    LD_16x8          $src, $stride
+    vld1.8           {q8}, [$src], $stride
+    vld1.8           {q9}, [$src], $stride
+    vld1.8           {q10}, [$src], $stride
+    vld1.8           {q11}, [$src], $stride
+    vld1.8           {q12}, [$src], $stride
+    vld1.8           {q13}, [$src], $stride
+    vld1.8           {q14}, [$src], $stride
+    vld1.8           {q15}, [$src], $stride
+    MEND
+
+    MACRO
+    ADD_DIFF_16x8    $diff
+    vqadd.u8         q8, q8, $diff
+    vqadd.u8         q9, q9, $diff
+    vqadd.u8         q10, q10, $diff
+    vqadd.u8         q11, q11, $diff
+    vqadd.u8         q12, q12, $diff
+    vqadd.u8         q13, q13, $diff
+    vqadd.u8         q14, q14, $diff
+    vqadd.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    SUB_DIFF_16x8    $diff
+    vqsub.u8         q8, q8, $diff
+    vqsub.u8         q9, q9, $diff
+    vqsub.u8         q10, q10, $diff
+    vqsub.u8         q11, q11, $diff
+    vqsub.u8         q12, q12, $diff
+    vqsub.u8         q13, q13, $diff
+    vqsub.u8         q14, q14, $diff
+    vqsub.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    ST_16x8          $dst, $stride
+    vst1.8           {q8}, [$dst], $stride
+    vst1.8           {q9}, [$dst], $stride
+    vst1.8           {q10},[$dst], $stride
+    vst1.8           {q11},[$dst], $stride
+    vst1.8           {q12},[$dst], $stride
+    vst1.8           {q13},[$dst], $stride
+    vst1.8           {q14},[$dst], $stride
+    vst1.8           {q15},[$dst], $stride
+    MEND
+
+;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;                              int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct32x32_1_add_neon| PROC
+    push             {lr}
+    pld              [r1]
+    add              r3, r1, #16               ; r3 dest + 16 for second loop
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asrs             r0, r0, #6                ; >> 6
+    bge              diff_positive_32_32
+
+diff_negative_32_32
+    neg              r0, r0
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    mov              r0, #4
+
+diff_negative_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_negative_32_32_loop
+    pop              {pc}
+
+diff_positive_32_32
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    mov              r0, #4
+
+diff_positive_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_positive_32_32_loop
+    pop              {pc}
+
+    ENDP             ; |vp9_idct32x32_1_add_neon|
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -1,0 +1,165 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void LD_16x8(
+        uint8_t *d,
+        int d_stride,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vld1q_u8(d);
+    d += d_stride;
+    *q9u8 = vld1q_u8(d);
+    d += d_stride;
+    *q10u8 = vld1q_u8(d);
+    d += d_stride;
+    *q11u8 = vld1q_u8(d);
+    d += d_stride;
+    *q12u8 = vld1q_u8(d);
+    d += d_stride;
+    *q13u8 = vld1q_u8(d);
+    d += d_stride;
+    *q14u8 = vld1q_u8(d);
+    d += d_stride;
+    *q15u8 = vld1q_u8(d);
+    return;
+}
+
+static INLINE void ADD_DIFF_16x8(
+        uint8x16_t qdiffu8,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+    *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void SUB_DIFF_16x8(
+        uint8x16_t qdiffu8,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+    *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void ST_16x8(
+        uint8_t *d,
+        int d_stride,
+        uint8x16_t *q8u8,
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    vst1q_u8(d, *q8u8);
+    d += d_stride;
+    vst1q_u8(d, *q9u8);
+    d += d_stride;
+    vst1q_u8(d, *q10u8);
+    d += d_stride;
+    vst1q_u8(d, *q11u8);
+    d += d_stride;
+    vst1q_u8(d, *q12u8);
+    d += d_stride;
+    vst1q_u8(d, *q13u8);
+    d += d_stride;
+    vst1q_u8(d, *q14u8);
+    d += d_stride;
+    vst1q_u8(d, *q15u8);
+    return;
+}
+
+void vp9_idct32x32_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int i, j, dest_stride8;
+    uint8_t *d;
+    int16_t a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
+    dest_stride8 = dest_stride * 8;
+    if (a1 >= 0) {  // diff_positive_32_32
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+        q0u8 = vdupq_n_u8(a1);
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
+            d = dest;
+            for (j = 0; j < 4; j++) {
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    } else {  // diff_negative_32_32
+        a1 = -a1;
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+        q0u8 = vdupq_n_u8(a1);
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
+            d = dest;
+            for (j = 0; j < 4; j++) {
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    }
+    return;
+}
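The sign split in vp9_idct32x32_1_add_neon() above exists so the clip to [0, 255] can be expressed with saturating byte adds (vqaddq_u8) and subtracts (vqsubq_u8) on |a1|. A scalar sketch of what one LD_16x8 / ADD_DIFF_16x8 (or SUB_DIFF_16x8) / ST_16x8 round trip computes; the helper name is hypothetical:

    #include <stdint.h>

    /* Scalar equivalent of one 16x8 tile of the loops above (sketch only). */
    static void add_dc_to_tile(uint8_t *dest, int stride, int a1) {  /* a1 may be negative */
      int r, c;
      for (r = 0; r < 8; ++r, dest += stride) {
        for (c = 0; c < 16; ++c) {
          const int v = dest[c] + a1;
          dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clip pixel */
        }
      }
    }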
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_add_neon.asm
@@ -1,0 +1,1299 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+;          dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64  EQU 16364
+cospi_2_64  EQU 16305
+cospi_3_64  EQU 16207
+cospi_4_64  EQU 16069
+cospi_5_64  EQU 15893
+cospi_6_64  EQU 15679
+cospi_7_64  EQU 15426
+cospi_8_64  EQU 15137
+cospi_9_64  EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU  9760
+cospi_20_64 EQU  9102
+cospi_21_64 EQU  8423
+cospi_22_64 EQU  7723
+cospi_23_64 EQU  7005
+cospi_24_64 EQU  6270
+cospi_25_64 EQU  5520
+cospi_26_64 EQU  4756
+cospi_27_64 EQU  3981
+cospi_28_64 EQU  3196
+cospi_29_64 EQU  2404
+cospi_30_64 EQU  1606
+cospi_31_64 EQU   804
+
+
+    EXPORT  |vp9_idct32x32_1024_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY
+
+    ; --------------------------------------------------------------------------
+    ; Load from transposed_buffer
+    ;   q13 = transposed_buffer[first_offset]
+    ;   q14 = transposed_buffer[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   transposed_buffer must be passed in. use 0 for first use.
+    MACRO
+    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+    ; address calculation with proper stride and loading
+    add r0, #($first_offset  - $prev_offset )*8*2
+    vld1.s16        {q14}, [r0]
+    add r0, #($second_offset - $first_offset)*8*2
+    vld1.s16        {q13}, [r0]
+    ; (used) two registers (q14, q13)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Load from output (used as temporary storage)
+    ;   reg1 = output[first_offset]
+    ;   reg2 = output[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. Use 0 for the first
+    ;   use.
+    MACRO
+    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and loading
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vld1.s16        {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vld1.s16        {$reg2}, [r1]
+    ; (used) two registers ($reg1, $reg2)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Store into output (sometimes as temporary storage)
+    ;   output[first_offset] = reg1
+    ;   output[second_offset] = reg2
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. Use 0 for the first
+    ;   use.
+    MACRO
+    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and storing
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vst1.16 {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vst1.16 {$reg2}, [r1]
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]
+    vst1.16         {d11}, [r9]
+    ; update pointers (by dest_stride * 2)
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]!
+    vst1.16         {d11}, [r9]!
+    ; update pointers (by dest_stride * 2)
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]
+    vst1.16         {d4}, [r7]
+    ; update pointers (by dest_stride * 2)
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]!
+    vst1.16         {d4}, [r7]!
+    ; update pointers (by dest_stride * 2)
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    ; TODO(cd): have special case to re-use constants when they are similar for
+    ;           consecutive butterflies
+    ; TODO(cd): have special case when both constants are the same, do the
+    ;           additions/subtractions before the multiplies.
+    ; generate the constants
+    ;   generate scalar constants
+    mov             r8,  #$first_constant  & 0xFF00
+    mov             r12, #$second_constant & 0xFF00
+    add             r8,  #$first_constant  & 0x00FF
+    add             r12, #$second_constant & 0x00FF
+    ;   generate vector constants
+    vdup.16         d30, r8
+    vdup.16         d31, r12
+    ; (used) two for inputs (regA-regD), one for constants (q15)
+    ; do some multiplications (ordered for maximum latency hiding)
+    vmull.s16 q8,  $regC, d30
+    vmull.s16 q10, $regA, d31
+    vmull.s16 q9,  $regD, d30
+    vmull.s16 q11, $regB, d31
+    vmull.s16 q12, $regC, d31
+    ; (used) five for intermediate (q8-q12), one for constants (q15)
+    ; do some additions/subtractions (to get back two registers)
+    vsub.s32  q8, q8, q10
+    vsub.s32  q9, q9, q11
+    ; do more multiplications (ordered for maximum latency hiding)
+    vmull.s16 q10, $regD, d31
+    vmull.s16 q11, $regA, d30
+    vmull.s16 q15, $regB, d30
+    ; (used) six for intermediate (q8-q12, q15)
+    ; do more additions/subtractions
+    vadd.s32  q11, q12, q11
+    vadd.s32  q10, q10, q15
+    ; (used) four for intermediate (q8-q11)
+    ; dct_const_round_shift
+    vqrshrn.s32 $reg1, q8,  #14
+    vqrshrn.s32 $reg2, q9,  #14
+    vqrshrn.s32 $reg3, q11, #14
+    vqrshrn.s32 $reg4, q10, #14
+    ; (used) two q registers for the results (i.e. four d registers)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    MEND
+    ; --------------------------------------------------------------------------
+
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;
+;   r0  int16_t *input,
+;   r1  uint8_t *dest,
+;   r2  int dest_stride)
+; loop counters
+;   r4  bands loop counter
+;   r5  pass loop counter
+;   r8  transpose loop counter
+; combine-add pointers
+;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
+;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
+;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
+;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
+
+|vp9_idct32x32_1024_add_neon| PROC
+    ; This function does one pass of idct32x32 transform.
+    ;
+    ; This is done by transposing the input and then doing a 1d transform on
+    ; columns. In the first pass, the transposed columns are the original
+    ; rows. In the second pass, after the transposition, the columns are the
+    ; original columns.
+    ; The 1d transform is done by looping over bands of eight columns (the
+    ; idct32_bands loop). For each band, the transform input transposition
+    ; is done on demand, one band of four 8x8 matrices at a time. The four
+    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+    push  {r4-r11}
+    vpush {d8-d15}
+    ; stack operation
+    ; internal buffer that 8 lines are transposed into before being transformed
+    ;   int16_t transpose_buffer[32 * 8];
+    ;   at sp + [4096, 4607]
+    ; results of the first pass (transpose and transform rows)
+    ;   int16_t pass1[32 * 32];
+    ;   at sp + [0, 2047]
+    ; results of the second pass (transpose and transform columns)
+    ;   int16_t pass2[32 * 32];
+    ;   at sp + [2048, 4095]
+    sub sp, sp, #512+2048+2048
+
+    ; r6  = dest + 31 * dest_stride
+    ; r7  = dest +  0 * dest_stride
+    ; r9  = dest + 15 * dest_stride
+    ; r10 = dest + 16 * dest_stride
+    rsb r6,  r2, r2, lsl #5
+    rsb r9,  r2, r2, lsl #4
+    add r10, r1, r2, lsl #4
+    mov r7, r1
+    add r6, r6, r1
+    add r9, r9, r1
+    ; r11 = -dest_stride
+    neg r11, r2
+    ; r3 = input
+    mov r3, r0
+    ; parameters for first pass
+      ; r0 = transpose_buffer[32 * 8]
+    add r0, sp, #4096
+      ; r1 = pass1[32 * 32]
+    mov r1, sp
+
+    mov r5, #0          ; initialize pass loop counter
+idct32_pass_loop
+    mov r4, #4          ; initialize bands loop counter
+idct32_bands_loop
+    mov r8, #2          ; initialize transpose loop counter
+idct32_transpose_pair_loop
+    ; Load two horizontally consecutive 8x8 16-bit data matrices. The first one
+    ; into q8-q15 and the second one into q0-q7. There is a stride of 64,
+    ; adjusted to 32 because of the two post-increments.
+    vld1.s16        {q8},  [r3]!
+    vld1.s16        {q0},  [r3]!
+    add r3, #32
+    vld1.s16        {q9},  [r3]!
+    vld1.s16        {q1},  [r3]!
+    add r3, #32
+    vld1.s16        {q10}, [r3]!
+    vld1.s16        {q2},  [r3]!
+    add r3, #32
+    vld1.s16        {q11}, [r3]!
+    vld1.s16        {q3},  [r3]!
+    add r3, #32
+    vld1.s16        {q12}, [r3]!
+    vld1.s16        {q4},  [r3]!
+    add r3, #32
+    vld1.s16        {q13}, [r3]!
+    vld1.s16        {q5},  [r3]!
+    add r3, #32
+    vld1.s16        {q14}, [r3]!
+    vld1.s16        {q6},  [r3]!
+    add r3, #32
+    vld1.s16        {q15}, [r3]!
+    vld1.s16        {q7},  [r3]!
+
+    ; Transpose the two 8x8 16bit data matrices.
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vswp            d1,  d8
+    vswp            d7,  d14
+    vswp            d5,  d12
+    vswp            d3,  d10
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.32         q0,  q2
+    vtrn.32         q1,  q3
+    vtrn.32         q4,  q6
+    vtrn.32         q5,  q7
+    vtrn.16         q8,  q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.16         q0,  q1
+    vtrn.16         q2,  q3
+    vtrn.16         q4,  q5
+    vtrn.16         q6,  q7
+
+    ; Store both matrices one after the other. There is a stride of 32, which
+    ; is fully covered by the post-increments.
+    vst1.16        {q8},  [r0]!
+    vst1.16        {q9},  [r0]!
+    vst1.16        {q10}, [r0]!
+    vst1.16        {q11}, [r0]!
+    vst1.16        {q12}, [r0]!
+    vst1.16        {q13}, [r0]!
+    vst1.16        {q14}, [r0]!
+    vst1.16        {q15}, [r0]!
+    vst1.16        {q0},  [r0]!
+    vst1.16        {q1},  [r0]!
+    vst1.16        {q2},  [r0]!
+    vst1.16        {q3},  [r0]!
+    vst1.16        {q4},  [r0]!
+    vst1.16        {q5},  [r0]!
+    vst1.16        {q6},  [r0]!
+    vst1.16        {q7},  [r0]!
+
+    ; increment pointers by adjusted stride (not necessary for r0/out)
+    ;   go back by 7*32 for the seven lines fully advanced by the loads and adds
+    ;   go back by 32 for the eighth line, which was only read
+    ;   advance by 16*2 to go to the next pair
+    sub r3,  r3,  #7*32*2 + 32 - 16*2
+    ; transpose pair loop processing
+    subs r8, r8, #1
+    bne idct32_transpose_pair_loop
+
+    ; restore r0 (the transpose buffer pointer) to its original value
+    sub r0, r0, #32*8*2
+
+    ; Instead of doing the transforms stage by stage, it is done by loading
+    ; some input values and doing as many stages as possible to minimize the
+    ; storing/loading of intermediate results. To fit within registers, the
+    ; final coefficients are cut into four blocks:
+    ; BLOCK A: 16-19,28-31
+    ; BLOCK B: 20-23,24-27
+    ; BLOCK C: 8-11,12-15
+    ; BLOCK D: 0-3,4-7
+    ; Blocks A and C are straight calculation through the various stages. In
+    ; block B, further calculations are performed using the results from
+    ; block A. In block D, further calculations are performed using the results
+    ; from block C and then the final calculations are done using results from
+    ; block A and B which have been combined at the end of block B.
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK A: 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; generate 16,17,30,31
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
+    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
+    ;step1b[16][i] = dct_const_round_shift(temp1);
+    ;step1b[31][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 0, 1, 31
+    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+    ;step1b[17][i] = dct_const_round_shift(temp1);
+    ;step1b[30][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 31, 17, 15
+    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[16] =  step1b[16][i] + step1b[17][i];
+    ;step2[17] =  step1b[16][i] - step1b[17][i];
+    ;step2[30] = -step1b[30][i] + step1b[31][i];
+    ;step2[31] =  step1b[30][i] + step1b[31][i];
+    vadd.s16  q4, q0, q1
+    vsub.s16  q13, q0, q1
+    vadd.s16  q6, q2, q3
+    vsub.s16  q14, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
+    ;step3[17] = dct_const_round_shift(temp1);
+    ;step3[30] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; generate 18,19,28,29
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
+    ;step1b[18][i] = dct_const_round_shift(temp1);
+    ;step1b[29][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 15, 9, 23
+    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
+    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+    ;step1b[19][i] = dct_const_round_shift(temp1);
+    ;step1b[28][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 23, 25, 7
+    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[18] = -step1b[18][i] + step1b[19][i];
+    ;step2[19] =  step1b[18][i] + step1b[19][i];
+    ;step2[28] =  step1b[28][i] + step1b[29][i];
+    ;step2[29] =  step1b[28][i] - step1b[29][i];
+    vsub.s16  q13, q3, q2
+    vadd.s16  q3,  q3, q2
+    vsub.s16  q14, q1, q0
+    vadd.s16  q2,  q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
+    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+    ;step3[29] = dct_const_round_shift(temp1);
+    ;step3[18] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+    ; --------------------------------------------------------------------------
+    ; combine 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[16] = step1b[16][i] + step1b[19][i];
+    ;step1[17] = step1b[17][i] + step1b[18][i];
+    ;step1[18] = step1b[17][i] - step1b[18][i];
+    ;step1[29] = step1b[30][i] - step1b[29][i];
+    ;step1[30] = step1b[30][i] + step1b[29][i];
+    ;step1[31] = step1b[31][i] + step1b[28][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q0
+    vadd.s16  q10, q7, q1
+    vadd.s16  q15, q6, q3
+    vsub.s16  q13, q5, q0
+    vsub.s16  q14, q7, q1
+    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
+    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
+    ;step2[18] = dct_const_round_shift(temp1);
+    ;step2[29] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+    STORE_IN_OUTPUT 30, 29, 18, q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[19] = step1b[16][i] - step1b[19][i];
+    ;step1[28] = step1b[31][i] - step1b[28][i];
+    vsub.s16  q13, q4, q2
+    vsub.s16  q14, q6, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
+    ;step2[19] = dct_const_round_shift(temp1);
+    ;step2[28] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+    STORE_IN_OUTPUT 18, 19, 28, q4, q6
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK B: 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; generate 20,21,26,27
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
+    ;step1b[20][i] = dct_const_round_shift(temp1);
+    ;step1b[27][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 7, 5, 27
+    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+    ;step1b[21][i] = dct_const_round_shift(temp1);
+    ;step1b[26][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 27, 21, 11
+    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[20] =  step1b[20][i] + step1b[21][i];
+    ;step2[21] =  step1b[20][i] - step1b[21][i];
+    ;step2[26] = -step1b[26][i] + step1b[27][i];
+    ;step2[27] =  step1b[26][i] + step1b[27][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+    ;step3[21] = dct_const_round_shift(temp1);
+    ;step3[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 22,23,24,25
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+    ;step1b[22][i] = dct_const_round_shift(temp1);
+    ;step1b[25][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 11, 13, 19
+    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
+    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+    ;step1b[23][i] = dct_const_round_shift(temp1);
+    ;step1b[24][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 19, 29, 3
+    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[22] = -step1b[22][i] + step1b[23][i];
+    ;step2[23] =  step1b[22][i] + step1b[23][i];
+    ;step2[24] =  step1b[24][i] + step1b[25][i];
+    ;step2[25] =  step1b[24][i] - step1b[25][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+    ;step3[25] = dct_const_round_shift(temp1);
+    ;step3[22] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[22] = step1b[22][i] + step1b[21][i];
+    ;step1[23] = step1b[23][i] + step1b[20][i];
+    vadd.s16  q10, q7, q1
+    vadd.s16  q11, q5, q0
+    ;step1[24] = step1b[24][i] + step1b[27][i];
+    ;step1[25] = step1b[25][i] + step1b[26][i];
+    vadd.s16  q12, q6, q2
+    vadd.s16  q15, q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[16] = step1b[16][i] + step1b[23][i];
+    ;step3[17] = step1b[17][i] + step1b[22][i];
+    ;step3[22] = step1b[17][i] - step1b[22][i];
+    ;step3[23] = step1b[16][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+    vadd.s16  q8,  q14, q11
+    vadd.s16  q9,  q13, q10
+    vsub.s16  q13, q13, q10
+    vsub.s16  q11, q14, q11
+    STORE_IN_OUTPUT 17, 17, 16, q9, q8
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[24] = step1b[31][i] - step1b[24][i];
+    ;step3[25] = step1b[30][i] - step1b[25][i];
+    ;step3[30] = step1b[30][i] + step1b[25][i];
+    ;step3[31] = step1b[31][i] + step1b[24][i];
+    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+    vsub.s16  q8,  q9,  q12
+    vadd.s16  q10, q14, q15
+    vsub.s16  q14, q14, q15
+    vadd.s16  q12, q9,  q12
+    STORE_IN_OUTPUT 31, 30, 31, q10, q12
+    ; --------------------------------------------------------------------------
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpush {q8}  ; [24]
+    vpush {q11} ; [23]
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+    ;step1[22] = dct_const_round_shift(temp1);
+    ;step1[25] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 31, 25, 22, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+    ;step1[23] = dct_const_round_shift(temp1);
+    ;step1[24] = dct_const_round_shift(temp2);
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpop  {q13} ; [23]
+    vpop  {q14} ; [24]
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 22, 24, 23, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[20] = step1b[23][i] - step1b[20][i];
+    ;step1[27] = step1b[24][i] - step1b[27][i];
+    vsub.s16  q14, q5, q0
+    vsub.s16  q13, q6, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
+    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+    ;step2[27] = dct_const_round_shift(temp1);
+    ;step2[20] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[21] = step1b[22][i] - step1b[21][i];
+    ;step1[26] = step1b[25][i] - step1b[26][i];
+    vsub.s16  q14,  q7, q1
+    vsub.s16  q13,  q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
+    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+    ;step2[26] = dct_const_round_shift(temp1);
+    ;step2[21] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[18] = step1b[18][i] + step1b[21][i];
+    ;step3[19] = step1b[19][i] + step1b[20][i];
+    ;step3[20] = step1b[19][i] - step1b[20][i];
+    ;step3[21] = step1b[18][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+    vadd.s16  q8,  q14, q1
+    vadd.s16  q9,  q13, q6
+    vsub.s16  q13, q13, q6
+    vsub.s16  q1,  q14, q1
+    STORE_IN_OUTPUT 19, 18, 19, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[27] = step1b[28][i] - step1b[27][i];
+    ;step3[28] = step1b[28][i] + step1b[27][i];
+    ;step3[29] = step1b[29][i] + step1b[26][i];
+    ;step3[26] = step1b[29][i] - step1b[26][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+    vsub.s16  q14, q8, q5
+    vadd.s16  q10, q8, q5
+    vadd.s16  q11, q9, q0
+    vsub.s16  q0, q9, q0
+    STORE_IN_OUTPUT 29, 28, 29, q10, q11
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+    ;step1[20] = dct_const_round_shift(temp1);
+    ;step1[27] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 29, 20, 27, q13, q14
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+    ;step1[21] = dct_const_round_shift(temp1);
+    ;step1[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+    STORE_IN_OUTPUT 27, 21, 26, q1, q0
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK C: 8-11,12-15
+    ; --------------------------------------------------------------------------
+    ; generate 8,9,14,15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+    ;step2[8] = dct_const_round_shift(temp1);
+    ;step2[15] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 3, 2, 30
+    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+    ;step2[9] = dct_const_round_shift(temp1);
+    ;step2[14] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 30, 18, 14
+    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[8] = step1b[8][i] + step1b[9][i];
+    ;step3[9] = step1b[8][i] - step1b[9][i];
+    ;step3[14] = step1b[15][i] - step1b[14][i];
+    ;step3[15] = step1b[15][i] + step1b[14][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
+    ;step1[9]  = dct_const_round_shift(temp1);
+    ;step1[14] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 10,11,12,13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+    ;step2[10] = dct_const_round_shift(temp1);
+    ;step2[13] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 14, 10, 22
+    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+    ;step2[11] = dct_const_round_shift(temp1);
+    ;step2[12] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 22, 26, 6
+    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[10] = step1b[11][i] - step1b[10][i];
+    ;step3[11] = step1b[11][i] + step1b[10][i];
+    ;step3[12] = step1b[12][i] + step1b[13][i];
+    ;step3[13] = step1b[12][i] - step1b[13][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
+    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+    ;step1[13] = dct_const_round_shift(temp1);
+    ;step1[10] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 8-11,12-15
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[8]  = step1b[8][i] + step1b[11][i];
+    ;step2[9]  = step1b[9][i] + step1b[10][i];
+    ;step2[10] = step1b[9][i] - step1b[10][i];
+    vadd.s16  q8,  q0, q5
+    vadd.s16  q9,  q1, q7
+    vsub.s16  q13, q1, q7
+    ;step2[13] = step1b[14][i] - step1b[13][i];
+    ;step2[14] = step1b[14][i] + step1b[13][i];
+    ;step2[15] = step1b[15][i] + step1b[12][i];
+    vsub.s16  q14, q3, q4
+    vadd.s16  q10, q3, q4
+    vadd.s16  q15, q2, q6
+    STORE_IN_OUTPUT 26, 8, 15, q8, q15
+    STORE_IN_OUTPUT 15, 9, 14, q9, q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+    ;step3[10] = dct_const_round_shift(temp1);
+    ;step3[13] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 14, 13, 10, q3, q1
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[11] = step1b[8][i] - step1b[11][i];
+    ;step2[12] = step1b[15][i] - step1b[12][i];
+    vsub.s16  q13, q0, q5
+    vsub.s16  q14,  q2, q6
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+    ;step3[11] = dct_const_round_shift(temp1);
+    ;step3[12] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 10, 11, 12, q1, q3
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK D: 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; generate 4,5,6,7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+    ;step3[4] = dct_const_round_shift(temp1);
+    ;step3[7] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 6, 4, 28
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+    ;step3[5] = dct_const_round_shift(temp1);
+    ;step3[6] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 28, 20, 12
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[4] = step1b[4][i] + step1b[5][i];
+    ;step1[5] = step1b[4][i] - step1b[5][i];
+    ;step1[6] = step1b[7][i] - step1b[6][i];
+    ;step1[7] = step1b[7][i] + step1b[6][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+    ;step2[5] = dct_const_round_shift(temp1);
+    ;step2[6] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 0,1,2,3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+    ;step1[1] = dct_const_round_shift(temp1);
+    ;step1[0] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 12, 0, 16
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+    ;step1[2] = dct_const_round_shift(temp1);
+    ;step1[3] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 16, 8, 24
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[0] = step1b[0][i] + step1b[3][i];
+    ;step2[1] = step1b[1][i] + step1b[2][i];
+    ;step2[2] = step1b[1][i] - step1b[2][i];
+    ;step2[3] = step1b[0][i] - step1b[3][i];
+    vadd.s16  q4, q7, q6
+    vsub.s16  q7, q7, q6
+    vsub.s16  q6, q5, q14
+    vadd.s16  q5, q5, q14
+    ; --------------------------------------------------------------------------
+    ; combine 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[0] = step1b[0][i] + step1b[7][i];
+    ;step3[1] = step1b[1][i] + step1b[6][i];
+    ;step3[2] = step1b[2][i] + step1b[5][i];
+    ;step3[3] = step1b[3][i] + step1b[4][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q3
+    vadd.s16  q10, q6, q1
+    vadd.s16  q11, q7, q0
+    ;step3[4] = step1b[3][i] - step1b[4][i];
+    ;step3[5] = step1b[2][i] - step1b[5][i];
+    ;step3[6] = step1b[1][i] - step1b[6][i];
+    ;step3[7] = step1b[0][i] - step1b[7][i];
+    vsub.s16  q12, q7, q0
+    vsub.s16  q13, q6, q1
+    vsub.s16  q14, q5, q3
+    vsub.s16  q15, q4, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[0] = step1b[0][i] + step1b[15][i];
+    ;step1[1] = step1b[1][i] + step1b[14][i];
+    ;step1[14] = step1b[1][i] - step1b[14][i];
+    ;step1[15] = step1b[0][i] - step1b[15][i];
+    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+    vadd.s16  q2, q8, q1
+    vadd.s16  q3, q9, q0
+    vsub.s16  q4, q9, q0
+    vsub.s16  q5, q8, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+
+    cmp r5, #0
+    bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+    STORE_IN_OUTPUT 17, 16, 17, q6, q7
+    STORE_IN_OUTPUT 17, 14, 15, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 31, 30, 31, q6, q7
+    STORE_IN_OUTPUT 31,  0,  1, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 19, 18, 19, q6, q7
+    STORE_IN_OUTPUT 19, 12, 13, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 29, 28, 29, q6, q7
+    STORE_IN_OUTPUT 29,  2,  3, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 21, 20, 21, q6, q7
+    STORE_IN_OUTPUT 21, 10, 11, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 27, 26, 27, q6, q7
+    STORE_IN_OUTPUT 27,  4,  5, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 23, 22, 23, q6, q7
+    STORE_IN_OUTPUT 23, 8, 9, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 25, 24, 25, q6, q7
+    STORE_IN_OUTPUT 25,  6,  7, q4, q5
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (STORE_IN_OUTPUT 25,  6,  7) => 7*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #7*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; parameters for second pass
+    ; the input of pass2 is the result of pass1. we have to remove the offset
+    ;   of 32 columns induced by the above idct32_bands_loop
+    sub r3, r1, #32*2
+      ; r1 = pass2[32 * 32]
+    add r1, sp, #2048
+
+    ; pass loop processing
+    add r5, r5, #1
+    b idct32_pass_loop
+
+idct32_bands_end_2nd_pass
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; restore pointers to their initial indices for the next band pass by
+    ;     removing/adding dest_stride * 8. The actual increment by eight
+    ;     is taken care of within the _LAST macros.
+    add r6,  r6,  r2, lsl #3
+    add r9,  r9,  r2, lsl #3
+    sub r7,  r7,  r2, lsl #3
+    sub r10, r10, r2, lsl #3
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #25*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; stack operation
+    add sp, sp, #512+2048+2048
+    vpop {d8-d15}
+    pop  {r4-r11}
+    bx              lr
+    ENDP  ; |vp9_idct32x32_1024_add_neon|
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -1,0 +1,719 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
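+// Helper macros that mirror the assembly version above. The 'prev' argument
+// is unused here; it is kept only so the C call sites line up with the
+// assembly macros, which used the previous offset to advance their pointers.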
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+    q14s16 = vld1q_s16(trans_buf + first * 8); \
+    q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+    qA = vld1q_s16(out + first * 32); \
+    qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+    vst1q_s16(out + first * 32, qA); \
+    vst1q_s16(out + second * 32, qB);
+
+#define  STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+                                      q6s16, q7s16, q8s16, q9s16);
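+// Rounds four final-stage results by >> 6, adds them to two pairs of dest
+// rows (p1/p1 + stride and p2 - stride/p2), clips to 8 bits and stores them
+// back. The int16x4_t locals simply hold 8 dest bytes viewed through 16-bit
+// lanes, mirroring the d-registers used by the assembly version.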
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+        uint8_t *p1,
+        uint8_t *p2,
+        int stride,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16) {
+    int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+    d8s16 = vld1_s16((int16_t *)p1);
+    p1 += stride;
+    d11s16 = vld1_s16((int16_t *)p2);
+    p2 -= stride;
+    d9s16 = vld1_s16((int16_t *)p1);
+    d10s16 = vld1_s16((int16_t *)p2);
+
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q8s16 = vrshrq_n_s16(q8s16, 6);
+    q9s16 = vrshrq_n_s16(q9s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           vreinterpret_u8_s16(d9s16)));
+    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                                           vreinterpret_u8_s16(d10s16)));
+    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                                           vreinterpret_u8_s16(d11s16)));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           vreinterpret_u8_s16(d8s16)));
+
+    d9s16  = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+    d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+    d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+    d8s16  = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+    vst1_s16((int16_t *)p1, d9s16);
+    p1 -= stride;
+    vst1_s16((int16_t *)p2, d10s16);
+    p2 += stride;
+    vst1_s16((int16_t *)p1, d8s16);
+    vst1_s16((int16_t *)p2, d11s16);
+    return;
+}
+
+#define  STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+                                      q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+        uint8_t *p1,
+        uint8_t *p2,
+        int stride,
+        int16x8_t q4s16,
+        int16x8_t q5s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16) {
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+    d4s16 = vld1_s16((int16_t *)p1);
+    p1 += stride;
+    d7s16 = vld1_s16((int16_t *)p2);
+    p2 -= stride;
+    d5s16 = vld1_s16((int16_t *)p1);
+    d6s16 = vld1_s16((int16_t *)p2);
+
+    q5s16 = vrshrq_n_s16(q5s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q4s16 = vrshrq_n_s16(q4s16, 6);
+
+    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                                           vreinterpret_u8_s16(d5s16)));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           vreinterpret_u8_s16(d6s16)));
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           vreinterpret_u8_s16(d7s16)));
+    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                                           vreinterpret_u8_s16(d4s16)));
+
+    d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+    d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+    d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+    d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+    vst1_s16((int16_t *)p1, d5s16);
+    p1 -= stride;
+    vst1_s16((int16_t *)p2, d6s16);
+    p2 += stride;
+    vst1_s16((int16_t *)p2, d7s16);
+    vst1_s16((int16_t *)p1, d4s16);
+    return;
+}
+
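+// Butterfly with rounding:
+//   *qAs16 = dct_const_round_shift(q14s16 * first_const - q13s16 * second_const)
+//   *qBs16 = dct_const_round_shift(q13s16 * first_const + q14s16 * second_const)
+// where the vqrshrn_n_s32(..., 14) narrowing performs the rounded shift by
+// DCT_CONST_BITS.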
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+        DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+static INLINE void DO_BUTTERFLY(
+        int16x8_t q14s16,
+        int16x8_t q13s16,
+        int16_t first_const,
+        int16_t second_const,
+        int16x8_t *qAs16,
+        int16x8_t *qBs16) {
+    int16x4_t d30s16, d31s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+    int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+    dCs16 = vget_low_s16(q14s16);
+    dDs16 = vget_high_s16(q14s16);
+    dAs16 = vget_low_s16(q13s16);
+    dBs16 = vget_high_s16(q13s16);
+
+    d30s16 = vdup_n_s16(first_const);
+    d31s16 = vdup_n_s16(second_const);
+
+    q8s32 = vmull_s16(dCs16, d30s16);
+    q10s32 = vmull_s16(dAs16, d31s16);
+    q9s32 = vmull_s16(dDs16, d30s16);
+    q11s32 = vmull_s16(dBs16, d31s16);
+    q12s32 = vmull_s16(dCs16, d31s16);
+
+    q8s32 = vsubq_s32(q8s32, q10s32);
+    q9s32 = vsubq_s32(q9s32, q11s32);
+
+    q10s32 = vmull_s16(dDs16, d31s16);
+    q11s32 = vmull_s16(dAs16, d30s16);
+    q15s32 = vmull_s16(dBs16, d30s16);
+
+    q11s32 = vaddq_s32(q12s32, q11s32);
+    q10s32 = vaddq_s32(q10s32, q15s32);
+
+    *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+                          vqrshrn_n_s32(q9s32, 14));
+    *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+                          vqrshrn_n_s32(q10s32, 14));
+    return;
+}
+
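+// Transposes the first eight rows of a 32-column input into t_buf as four
+// consecutive transposed 8x8 tiles, i.e. an 8x32 strip becomes a 32x8 buffer.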
+static INLINE void idct32_transpose_pair(
+        int16_t *input,
+        int16_t *t_buf) {
+    int16_t *in;
+    int i;
+    const int stride = 32;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
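+    // Each iteration transposes one 8x8 tile: the vcombine calls emulate the
+    // vswp of high/low d-register halves from the assembly, then vtrnq_s32
+    // and vtrnq_s16 finish the transpose.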
+    for (i = 0; i < 4; i++, input += 8) {
+        in = input;
+        q8s16 = vld1q_s16(in);
+        in += stride;
+        q9s16 = vld1q_s16(in);
+        in += stride;
+        q10s16 = vld1q_s16(in);
+        in += stride;
+        q11s16 = vld1q_s16(in);
+        in += stride;
+        q12s16 = vld1q_s16(in);
+        in += stride;
+        q13s16 = vld1q_s16(in);
+        in += stride;
+        q14s16 = vld1q_s16(in);
+        in += stride;
+        q15s16 = vld1q_s16(in);
+
+        d16s16 = vget_low_s16(q8s16);
+        d17s16 = vget_high_s16(q8s16);
+        d18s16 = vget_low_s16(q9s16);
+        d19s16 = vget_high_s16(q9s16);
+        d20s16 = vget_low_s16(q10s16);
+        d21s16 = vget_high_s16(q10s16);
+        d22s16 = vget_low_s16(q11s16);
+        d23s16 = vget_high_s16(q11s16);
+        d24s16 = vget_low_s16(q12s16);
+        d25s16 = vget_high_s16(q12s16);
+        d26s16 = vget_low_s16(q13s16);
+        d27s16 = vget_high_s16(q13s16);
+        d28s16 = vget_low_s16(q14s16);
+        d29s16 = vget_high_s16(q14s16);
+        d30s16 = vget_low_s16(q15s16);
+        d31s16 = vget_high_s16(q15s16);
+
+        q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+        q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+        q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+        q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+        q12s16 = vcombine_s16(d17s16, d25s16);
+        q13s16 = vcombine_s16(d19s16, d27s16);
+        q14s16 = vcombine_s16(d21s16, d29s16);
+        q15s16 = vcombine_s16(d23s16, d31s16);
+
+        q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                            vreinterpretq_s32_s16(q10s16));
+        q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+                            vreinterpretq_s32_s16(q11s16));
+        q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+                            vreinterpretq_s32_s16(q14s16));
+        q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+                            vreinterpretq_s32_s16(q15s16));
+
+        q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                            vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+        q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                            vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+        q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                            vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+        q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                            vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+        vst1q_s16(t_buf, q0x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q0x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q1x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q1x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q2x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q2x2s16.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q3x2s16.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, q3x2s16.val[1]);
+        t_buf += 8;
+    }
+    return;
+}
+
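+// Final combining stage for one 8-column band during pass 1: merges the
+// partial results with values already in the intermediate output buffer and
+// stores all 32 rows of the band back to that buffer.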
+static INLINE void idct32_bands_end_1st_pass(
+        int16_t *out,
+        int16x8_t q2s16,
+        int16x8_t q3s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16,
+        int16x8_t q10s16,
+        int16x8_t q11s16,
+        int16x8_t q12s16,
+        int16x8_t q13s16,
+        int16x8_t q14s16,
+        int16x8_t q15s16) {
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+    STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+    STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+    STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+    STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+    STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+    STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+    STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+    LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+    STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+    LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+    STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+    return;
+}
+
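+// Same combining stage for pass 2, except the results are added directly to
+// the destination frame via the STORE_COMBINE_* helpers (which apply the
+// >> 6 rounding and clipping).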
+static INLINE void idct32_bands_end_2nd_pass(
+        int16_t *out,
+        uint8_t *dest,
+        int stride,
+        int16x8_t q2s16,
+        int16x8_t q3s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16,
+        int16x8_t q10s16,
+        int16x8_t q11s16,
+        int16x8_t q12s16,
+        int16x8_t q13s16,
+        int16x8_t q14s16,
+        int16x8_t q15s16) {
+    uint8_t *r6  = dest + 31 * stride;
+    uint8_t *r7  = dest/* +  0 * stride*/;
+    uint8_t *r9  = dest + 15 * stride;
+    uint8_t *r10 = dest + 16 * stride;
+    int str2 = stride << 1;
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+
+    LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+
+    LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+    LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    return;
+}
+
+void vp9_idct32x32_1024_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int stride) {
+    int i, idct32_pass_loop;
+    int16_t trans_buf[32 * 8];
+    int16_t pass1[32 * 32];
+    int16_t pass2[32 * 32];
+    int16_t *out;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
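+    // Two passes over the 32x32 block: pass 1 transforms the rows (working on
+    // transposed 8-wide bands) into pass1[], pass 2 transforms the columns and
+    // adds the result to dest. Each pass processes four bands of eight.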
+    for (idct32_pass_loop = 0, out = pass1;
+         idct32_pass_loop < 2;
+         idct32_pass_loop++,
+         input = pass1,  // the input of pass2 is the result of pass1
+         out = pass2) {
+        for (i = 0;
+             i < 4; i++,
+             input += 32 * 8, out += 8) {  // idct32_bands_loop
+            idct32_transpose_pair(input, trans_buf);
+
+            // -----------------------------------------
+            // BLOCK A: 16-19,28-31
+            // -----------------------------------------
+            // generate 16,17,30,31
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(0, 1, 31)
+            DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(31, 17, 15)
+            DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+            // part of stage 2
+            q4s16 = vaddq_s16(q0s16, q1s16);
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q6s16 = vaddq_s16(q2s16, q3s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+            // generate 18,19,28,29
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(15, 9, 23)
+            DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(23, 25, 7)
+            DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q3s16, q2s16);
+            q3s16 = vaddq_s16(q3s16, q2s16);
+            q14s16 = vsubq_s16(q1s16, q0s16);
+            q2s16 = vaddq_s16(q1s16, q0s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+            // part of stage 4
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q15s16 = vaddq_s16(q6s16, q3s16);
+            q13s16 = vsubq_s16(q5s16, q0s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+            STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+            STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q4s16, q2s16);
+            q14s16 = vsubq_s16(q6s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+            STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+            // -----------------------------------------
+            // BLOCK B: 20-23,24-27
+            // -----------------------------------------
+            // generate 20,21,26,27
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(7, 5, 27)
+            DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(27, 21, 11)
+            DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+            // generate 22,23,24,25
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(11, 13, 19)
+            DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(19, 29, 3)
+            DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+            // part of stage 2
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16  = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16  = vaddq_s16(q6s16, q7s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+            // part of stage 4
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q11s16 = vaddq_s16(q5s16, q0s16);
+            q12s16 = vaddq_s16(q6s16, q2s16);
+            q15s16 = vaddq_s16(q4s16, q3s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q11s16);
+            q9s16 = vaddq_s16(q13s16, q10s16);
+            q13s16 = vsubq_s16(q13s16, q10s16);
+            q11s16 = vsubq_s16(q14s16, q11s16);
+            STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+            LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+            q8s16  = vsubq_s16(q9s16, q12s16);
+            q10s16 = vaddq_s16(q14s16, q15s16);
+            q14s16 = vsubq_s16(q14s16, q15s16);
+            q12s16 = vaddq_s16(q9s16, q12s16);
+            STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+            q13s16 = q11s16;
+            q14s16 = q8s16;
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+            // part of stage 4
+            q14s16 = vsubq_s16(q5s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q2s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            q13s16 = vsubq_s16(q4s16, q3s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q1s16);
+            q9s16 = vaddq_s16(q13s16, q6s16);
+            q13s16 = vsubq_s16(q13s16, q6s16);
+            q1s16 = vsubq_s16(q14s16, q1s16);
+            STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+            LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+            q14s16 = vsubq_s16(q8s16, q5s16);
+            q10s16 = vaddq_s16(q8s16, q5s16);
+            q11s16 = vaddq_s16(q9s16, q0s16);
+            q0s16 = vsubq_s16(q9s16, q0s16);
+            STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+                                                         &q1s16, &q0s16);
+            STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+            // -----------------------------------------
+            // BLOCK C: 8-11,12-15
+            // -----------------------------------------
+            // generate 8,9,14,15
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(3, 2, 30)
+            DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(30, 18, 14)
+            DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+            // part of stage 3
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+            // generate 10,11,12,13
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(14, 10, 22)
+            DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(22, 26, 6)
+            DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+            // part of stage 3
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16 = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16 = vaddq_s16(q6s16, q7s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+            // part of stage 5
+            q8s16 = vaddq_s16(q0s16, q5s16);
+            q9s16 = vaddq_s16(q1s16, q7s16);
+            q13s16 = vsubq_s16(q1s16, q7s16);
+            q14s16 = vsubq_s16(q3s16, q4s16);
+            q10s16 = vaddq_s16(q3s16, q4s16);
+            q15s16 = vaddq_s16(q2s16, q6s16);
+            STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+            STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+            // part of stage 6
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+            q13s16 = vsubq_s16(q0s16, q5s16);
+            q14s16 = vsubq_s16(q2s16, q6s16);
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+            // -----------------------------------------
+            // BLOCK D: 0-3,4-7
+            // -----------------------------------------
+            // generate 4,5,6,7
+            // part of stage 3
+            LOAD_FROM_TRANSPOSED(6, 4, 28)
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(28, 20, 12)
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+            // generate 0,1,2,3
+            // part of stage 4
+            LOAD_FROM_TRANSPOSED(12, 0, 16)
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(16, 8, 24)
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+            // part of stage 5
+            q4s16 = vaddq_s16(q7s16, q6s16);
+            q7s16 = vsubq_s16(q7s16, q6s16);
+            q6s16 = vsubq_s16(q5s16, q14s16);
+            q5s16 = vaddq_s16(q5s16, q14s16);
+            // part of stage 6
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q3s16);
+            q10s16 = vaddq_s16(q6s16, q1s16);
+            q11s16 = vaddq_s16(q7s16, q0s16);
+            q12s16 = vsubq_s16(q7s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q1s16);
+            q14s16 = vsubq_s16(q5s16, q3s16);
+            q15s16 = vsubq_s16(q4s16, q2s16);
+            // part of stage 7
+            LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+            q2s16 = vaddq_s16(q8s16, q1s16);
+            q3s16 = vaddq_s16(q9s16, q0s16);
+            q4s16 = vsubq_s16(q9s16, q0s16);
+            q5s16 = vsubq_s16(q8s16, q1s16);
+            LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+            q8s16 = vaddq_s16(q4s16, q1s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q6s16 = vsubq_s16(q5s16, q0s16);
+            q7s16 = vsubq_s16(q4s16, q1s16);
+
+            if (idct32_pass_loop == 0) {
+                idct32_bands_end_1st_pass(out,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+            } else {
+                idct32_bands_end_2nd_pass(out, dest, stride,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+                dest += 8;
+            }
+        }
+    }
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -1,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
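+    ; (cospi_16_64 = round(2^14 * cos(pi/4)) = 11585, so each multiply below,
+    ;  combined with its >> 14, scales by roughly 1/sqrt(2))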
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    vld1.32          {d2[0]}, [r1], r2
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2
+    vld1.32          {d4[1]}, [r1]
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1
+    vaddw.u8         q9, q0, d4
+
+    vqmovun.s16      d6, q8                    ; clip_pixel
+    vqmovun.s16      d7, q9
+
+    vst1.32          {d6[0]}, [r12], r2
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]
+
+    bx               lr
+    ENDP             ; |vp9_idct4x4_1_add_neon|
+
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -1,0 +1,50 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vp9_idct4x4_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d6u8;
+    uint32x2_t d2u32 = vdup_n_u32(0);
+    uint16x8_t q8u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 4);
+
+    q0s16 = vdupq_n_s16(a1);
+
+    // dc_only_idct_add
+    d1 = d2 = dest;
+    for (i = 0; i < 2; i++) {
+        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+        d1 += dest_stride;
+        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+        d1 += dest_stride;
+
+        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+                         vreinterpret_u8_u32(d2u32));
+        d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+        d2 += dest_stride;
+        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+        d2 += dest_stride;
+    }
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -1,0 +1,190 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_idct4x4_16_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct4x4_16_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are very similar.
+    ; We first transform the rows: transpose the input, do a SIMD column
+    ; transform (the columns are the transposed rows) and then transpose
+    ; the results so they go back into normal/row positions. Then we
+    ; transform the columns by doing another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
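+    ; Roughly, in pseudocode:
+    ;   transpose(in); idct4_cols(in); transpose(in); idct4_cols(in); add to dest
+    ; (idct4_cols is a stand-in name for the stage 1/stage 2 code below.)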
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0         ; replicate cospi_8_64
+    vdup.16         d21, r3         ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12        ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+    vswp     d18, d19
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14
+    vsub.s16 q9,  q13, q14
+
+    ; The results are in two registers, the second one with its halves
+    ; swapped. This is handled by loading the 'dest' values in the same
+    ; swapped order and storing them back in that swapped order as well.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    vld1.32 {d26[0]}, [r1], r2
+    vld1.32 {d26[1]}, [r1], r2
+    vld1.32 {d27[1]}, [r1], r2
+    vld1.32 {d27[0]}, [r1]  ; no post-increment
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2
+    vst1.32 {d27[1]}, [r1], r2
+    vst1.32 {d26[1]}, [r1], r2
+    vst1.32 {d26[0]}, [r1]  ; no post-increment
+    bx              lr
+    ENDP  ; |vp9_idct4x4_16_add_neon|
+
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -1,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
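+// C mirror of the assembly version above: two passes of transpose followed by
+// a 4-point column IDCT (stage 1 / stage 2), then the results are rounded by
+// >> 4 and added to dest with clipping.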
+void vp9_idct4x4_16_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d26u8, d27u8;
+    uint32x2_t d26u32, d27u32;
+    uint16x8_t q8u16, q9u16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+    int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+    int16x8_t q8s16, q9s16, q13s16, q14s16;
+    int32x4_t q1s32, q13s32, q14s32, q15s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+    uint8_t *d;
+    int16_t cospi_8_64 = 15137;
+    int16_t cospi_16_64 = 11585;
+    int16_t cospi_24_64 = 6270;
+
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    d20s16 = vdup_n_s16(cospi_8_64);
+    d21s16 = vdup_n_s16(cospi_16_64);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    d22s16 = vdup_n_s16(cospi_24_64);
+
+    // stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);
+    q1s32  = vmull_s16(d17s16, d20s16);
+    q13s32 = vmull_s16(d23s16, d21s16);
+    q14s32 = vmull_s16(d24s16, d21s16);
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // stage 2
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+    d19s16 = vget_low_s16(q9s16);
+
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    // do the transform on columns
+    // stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);
+    q1s32  = vmull_s16(d17s16, d20s16);
+    q13s32 = vmull_s16(d23s16, d21s16);
+    q14s32 = vmull_s16(d24s16, d21s16);
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // stage 2
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d = dest;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+    d += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u32(d27u32));
+
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    d = dest;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -1,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vp9_idct8x8_1_add_neon|
+
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vp9_idct8x8_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint64x1_t d2u64, d3u64, d4u64, d5u64;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 5);
+
+    q0s16 = vdupq_n_s16(a1);
+    q0u16 = vreinterpretq_u16_s16(q0s16);
+
+    d1 = d2 = dest;
+    for (i = 0; i < 2; i++) {
+        d2u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d4u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+        d5u64 = vld1_u64((const uint64_t *)d1);
+        d1 += dest_stride;
+
+        q9u16  = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+        q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+        q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+        q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+        d2u8  = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d3u8  = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+        d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+        d2 += dest_stride;
+    }
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -1,0 +1,519 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_idct8x8_64_add_neon|
+    EXPORT  |vp9_idct8x8_12_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix that is
+    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
+    ; This macro will touch the q0-q7 registers and use them as a buffer
+    ; during the calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct8x8_64_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vp9_idct8x8_64_add_neon|
+
+;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vp9_idct8x8_12_add_neon| PROC
+    push            {r4-r9}
+    vpush           {d8-d15}
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    ; stage 1
+    ; The following instructions use vqrdmulh to compute
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh does a doubling
+    ; multiply and shifts the result by 16 bits instead of 14, so the
+    ; constants are doubled before multiplying to compensate for this.
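+    ; For reference (ignoring saturation):
+    ; vqrdmulh(a, b) = round((2 * a * b) >> 16) and
+    ; dct_const_round_shift(a * c) = round((a * c) >> 14),
+    ; so using b = 2 * c gives
+    ; round((2 * a * 2 * c) >> 16) == round((a * c) >> 14).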
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input[0] * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[1] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 - odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vp9_idct8x8_12_add_neon|
+
+    END
--- /dev/null
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -1,0 +1,540 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
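+// Transposes the 8x8 block of 16-bit coefficients held in q8-q15 in place
+// (the C equivalent of the TRANSPOSE8X8 macro in the assembly version).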
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
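+// Performs one 1-D pass of the 8-point inverse DCT on the coefficients held
+// in q8-q15, writing the results back in place (the C equivalent of the
+// IDCT8x8_1D assembly macro).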
+static INLINE void IDCT8x8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16 = vqrshrn_n_s32(q2s32, 14);
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16 = vaddq_s16(q0s16, q7s16);
+    *q9s16 = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+void vp9_idct8x8_64_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
+
+void vp9_idct8x8_12_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    // First transform rows
+    // stage 1
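+    // As in the assembly version, vqrdmulhq_s16 doubles the product and keeps
+    // the high 16 bits (>> 16 instead of the >> 14 in dct_const_round_shift),
+    // so the cospi constants below are doubled to compensate.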
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+    q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+    q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+    q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+    // stage 2 & stage 3 - even half
+    q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+    q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+    q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+    q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+    // stage 3 - odd half
+    q0s16 = vaddq_s16(q9s16, q15s16);
+    q1s16 = vaddq_s16(q9s16, q13s16);
+    q2s16 = vsubq_s16(q9s16, q13s16);
+    q3s16 = vsubq_s16(q9s16, q15s16);
+
+    // stage 2 - odd half
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    q8s16 = vaddq_s16(q0s16, q7s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q7s16);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
--- /dev/null
+++ b/vpx_dsp/inv_txfm.c
@@ -1,0 +1,2476 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vpx_dsp/inv_txfm.h"
+
+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+
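+  // Rows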
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, 8);
+    op[1] = WRAPLOW(b1, 8);
+    op[2] = WRAPLOW(c1, 8);
+    op[3] = WRAPLOW(d1, 8);
+    ip += 4;
+    op += 4;
+  }
+
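+  // Columns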
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1, 8);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
+    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
+    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
+    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
+    ip++;
+    dest++;
+  }
+}
+
+void idct4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], 8);
+  output[1] = WRAPLOW(step[1] + step[2], 8);
+  output[2] = WRAPLOW(step[1] - step[2], 8);
+  output[3] = WRAPLOW(step[0] - step[3], 8);
+}
+
+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    idct4_c(input, outptr);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    idct4_c(temp_in, temp_out);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
+  }
+}
+
+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = clip_pixel_add(dest[0], a1);
+    dest[1] = clip_pixel_add(dest[1], a1);
+    dest[2] = clip_pixel_add(dest[2], a1);
+    dest[3] = clip_pixel_add(dest[3], a1);
+    dest += dest_stride;
+  }
+}
+
+void idct8_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2 & stage 3 - even half
+  idct4_c(step1, step1);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(step1[1] + step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(step1[3] + step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(step1[2] - step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  for (i = 0; i < 8; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
+
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
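+  // (29 bits in, minus the 14-bit shift in dct_const_round_shift, leaves 15.)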
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
+  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+}
+
+void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
+  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
+  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
+  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+  // stage 2
+  s0 = (int)x0;
+  s1 = (int)x1;
+  s2 = (int)x2;
+  s3 = (int)x3;
+  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+  x0 = WRAPLOW(s0 + s2, 8);
+  x1 = WRAPLOW(s1 + s3, 8);
+  x2 = WRAPLOW(s0 - s2, 8);
+  x3 = WRAPLOW(s1 - s3, 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+  // stage 3
+  s2 = (int)(cospi_16_64 * (x2 + x3));
+  s3 = (int)(cospi_16_64 * (x2 - x3));
+  s6 = (int)(cospi_16_64 * (x6 + x7));
+  s7 = (int)(cospi_16_64 * (x6 - x7));
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x4, 8);
+  output[2] = WRAPLOW(x6, 8);
+  output[3] = WRAPLOW(-x2, 8);
+  output[4] = WRAPLOW(x3, 8);
+  output[5] = WRAPLOW(-x7, 8);
+  output[6] = WRAPLOW(x5, 8);
+  output[7] = WRAPLOW(-x1, 8);
+}
+
+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  // only the first 4 rows have non-zero coefficients
+  for (i = 0; i < 4; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
+void idct16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(step2[1] + step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(step2[3] + step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(step2[5] + step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(step2[7] + step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(step2[6] - step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(step2[4] - step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(step2[2] - step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, 8);
+  x1 = WRAPLOW(s1 + s5, 8);
+  x2 = WRAPLOW(s2 + s6, 8);
+  x3 = WRAPLOW(s3 + s7, 8);
+  x4 = WRAPLOW(s0 - s4, 8);
+  x5 = WRAPLOW(s1 - s5, 8);
+  x6 = WRAPLOW(s2 - s6, 8);
+  x7 = WRAPLOW(s3 - s7, 8);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(check_range(s0 + s2), 8);
+  x1 = WRAPLOW(check_range(s1 + s3), 8);
+  x2 = WRAPLOW(check_range(s0 - s2), 8);
+  x3 = WRAPLOW(check_range(s1 - s3), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x8 = WRAPLOW(check_range(s8 + s10), 8);
+  x9 = WRAPLOW(check_range(s9 + s11), 8);
+  x10 = WRAPLOW(check_range(s8 - s10), 8);
+  x11 = WRAPLOW(check_range(s9 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x8, 8);
+  output[2] = WRAPLOW(x12, 8);
+  output[3] = WRAPLOW(-x4, 8);
+  output[4] = WRAPLOW(x6, 8);
+  output[5] = WRAPLOW(x14, 8);
+  output[6] = WRAPLOW(x10, 8);
+  output[7] = WRAPLOW(x2, 8);
+  output[8] = WRAPLOW(x3, 8);
+  output[9] = WRAPLOW(x11, 8);
+  output[10] = WRAPLOW(x15, 8);
+  output[11] = WRAPLOW(x7, 8);
+  output[12] = WRAPLOW(x5, 8);
+  output[13] = WRAPLOW(-x13, 8);
+  output[14] = WRAPLOW(x9, 8);
+  output[15] = WRAPLOW(-x1, 8);
+}
+
+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 4x4 area, we only need to calculate first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
+
+void idct32_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], 8);
+  output[1] = WRAPLOW(step1[1] + step1[30], 8);
+  output[2] = WRAPLOW(step1[2] + step1[29], 8);
+  output[3] = WRAPLOW(step1[3] + step1[28], 8);
+  output[4] = WRAPLOW(step1[4] + step1[27], 8);
+  output[5] = WRAPLOW(step1[5] + step1[26], 8);
+  output[6] = WRAPLOW(step1[6] + step1[25], 8);
+  output[7] = WRAPLOW(step1[7] + step1[24], 8);
+  output[8] = WRAPLOW(step1[8] + step1[23], 8);
+  output[9] = WRAPLOW(step1[9] + step1[22], 8);
+  output[10] = WRAPLOW(step1[10] + step1[21], 8);
+  output[11] = WRAPLOW(step1[11] + step1[20], 8);
+  output[12] = WRAPLOW(step1[12] + step1[19], 8);
+  output[13] = WRAPLOW(step1[13] + step1[18], 8);
+  output[14] = WRAPLOW(step1[14] + step1[17], 8);
+  output[15] = WRAPLOW(step1[15] + step1[16], 8);
+  output[16] = WRAPLOW(step1[15] - step1[16], 8);
+  output[17] = WRAPLOW(step1[14] - step1[17], 8);
+  output[18] = WRAPLOW(step1[13] - step1[18], 8);
+  output[19] = WRAPLOW(step1[12] - step1[19], 8);
+  output[20] = WRAPLOW(step1[11] - step1[20], 8);
+  output[21] = WRAPLOW(step1[10] - step1[21], 8);
+  output[22] = WRAPLOW(step1[9] - step1[22], 8);
+  output[23] = WRAPLOW(step1[8] - step1[23], 8);
+  output[24] = WRAPLOW(step1[7] - step1[24], 8);
+  output[25] = WRAPLOW(step1[6] - step1[25], 8);
+  output[26] = WRAPLOW(step1[5] - step1[26], 8);
+  output[27] = WRAPLOW(step1[4] - step1[27], 8);
+  output[28] = WRAPLOW(step1[3] - step1[28], 8);
+  output[29] = WRAPLOW(step1[2] - step1[29], 8);
+  output[30] = WRAPLOW(step1[1] - step1[30], 8);
+  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    int16_t zero_coeff[16];
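+    // Fold the 32 coefficients of this row together with pairwise ORs; a zero
+    // result means the row is all zero and the 32-point IDCT can be skipped.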
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      idct32_c(input, outptr);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  // Only the upper-left 8x8 area has non-zero coefficients.
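+  // (The "_34" variant is used for blocks with at most 34 non-zero
+  // coefficients, which in scan order all lie inside that 8x8 corner.)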
+  for (i = 0; i < 8; ++i) {
+    idct32_c(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_high_t a1;
+
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
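+  // (Each 4-point 1-D pass uses 7 adds and 1 shift for 4 samples; two passes
+  //  give the 3.5 adds and 0.5 shifts per pixel quoted above.)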
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, bd);
+    op[1] = WRAPLOW(b1, bd);
+    op[2] = WRAPLOW(c1, bd);
+    op[3] = WRAPLOW(d1, bd);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  (void) bd;
+
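+  // DC-only path: split the single coefficient into a1 (position 0) and e1
+  // (positions 1-3), matching what the full inverse WHT gives for this input.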
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = highbd_clip_pixel_add(
+        dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = highbd_clip_pixel_add(
+        dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = highbd_clip_pixel_add(
+        dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = highbd_clip_pixel_add(
+        dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(step[1] + step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[0] - step[3], bd);
+}
+
+void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    vp9_highbd_idct4_c(input, outptr, bd);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    vp9_highbd_idct4_c(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+    dest += dest_stride;
+  }
+}
+
+void vp9_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2 & stage 3 - even half
+  vp9_highbd_idct4_c(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 8; ++i) {
+    vp9_highbd_idct8_c(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    vp9_highbd_idct8_c(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+void highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3)) {
+    memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = (tran_high_t)(x0 - x2 + x3);
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // The 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth after the 14-bit rounding shift is 15b.
+  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+void highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[7];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[5];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[3];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[1];
+  tran_low_t x7 = input[6];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
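+  // Final reordering; the odd-indexed outputs are negated.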
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x4, bd);
+  output[2] = WRAPLOW(x6, bd);
+  output[3] = WRAPLOW(-x2, bd);
+  output[4] = WRAPLOW(x3, bd);
+  output[5] = WRAPLOW(-x7, bd);
+  output[6] = WRAPLOW(x5, bd);
+  output[7] = WRAPLOW(-x1, bd);
+}
+
+void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  // Only the first 4 rows have non-zero coefficients.
+  for (i = 0; i < 4; ++i) {
+    vp9_highbd_idct8_c(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    vp9_highbd_idct8_c(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
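+  // The N/2 indices follow the 32-point coefficient ordering (see
+  // highbd_idct32_c below) with each index halved.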
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 16; ++i) {
+    vp9_highbd_idct16_c(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    vp9_highbd_idct16_c(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_low_t x0 = input[15];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[13];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[11];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[9];
+  tran_low_t x7 = input[6];
+  tran_low_t x8 = input[7];
+  tran_low_t x9 = input[8];
+  tran_low_t x10 = input[5];
+  tran_low_t x11 = input[10];
+  tran_low_t x12 = input[3];
+  tran_low_t x13 = input[12];
+  tran_low_t x14 = input[1];
+  tran_low_t x15 = input[14];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, bd);
+  x1 = WRAPLOW(s1 + s5, bd);
+  x2 = WRAPLOW(s2 + s6, bd);
+  x3 = WRAPLOW(s3 + s7, bd);
+  x4 = WRAPLOW(s0 - s4, bd);
+  x5 = WRAPLOW(s1 - s5, bd);
+  x6 = WRAPLOW(s2 - s6, bd);
+  x7 = WRAPLOW(s3 - s7, bd);
+  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x8 = WRAPLOW(s8 + s10, bd);
+  x9 = WRAPLOW(s9 + s11, bd);
+  x10 = WRAPLOW(s8 - s10, bd);
+  x11 = WRAPLOW(s9 - s11, bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x8, bd);
+  output[2] = WRAPLOW(x12, bd);
+  output[3] = WRAPLOW(-x4, bd);
+  output[4] = WRAPLOW(x6, bd);
+  output[5] = WRAPLOW(x14, bd);
+  output[6] = WRAPLOW(x10, bd);
+  output[7] = WRAPLOW(x2, bd);
+  output[8] = WRAPLOW(x3, bd);
+  output[9] = WRAPLOW(x11, bd);
+  output[10] = WRAPLOW(x15, bd);
+  output[11] = WRAPLOW(x7, bd);
+  output[12] = WRAPLOW(x5, bd);
+  output[13] = WRAPLOW(-x13, bd);
+  output[14] = WRAPLOW(x9, bd);
+  output[15] = WRAPLOW(-x1, bd);
+}
+
+void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    vp9_highbd_idct16_c(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    vp9_highbd_idct16_c(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_low_t zero_coeff[16];
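+    // Reduce the 32 coefficients of this row with pairwise ORs so the row
+    // transform below can be skipped when the entire row is zero.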
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      highbd_idct32_c(input, outptr, bd);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    highbd_idct32_c(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  // Only the upper-left 8x8 block has non-zero coefficients.
+  for (i = 0; i < 8; ++i) {
+    highbd_idct32_c(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    highbd_idct32_c(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  int i, j;
+  int a1;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  tran_low_t out = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
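+  // a1 is the DC term taken through both 1-D passes (two multiplications by
+  // cospi_16_64) and the final >>6 output rounding, so it can simply be
+  // added to every pixel of the 32x32 block below.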
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+++ b/vpx_dsp/inv_txfm.h
@@ -1,0 +1,124 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a decoder,
+  // therefore the following assertion is only enabled when configured with
+  // --enable-coefficient-range-checking.
+  assert(INT16_MIN <= input);
+  assert(input <= INT16_MAX);
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return check_range(rv);
+}
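+
+// Worked example (illustrative only): with DCT_CONST_BITS == 14 and
+// cospi_16_64 == 11585 (~0.7071 in Q14 fixed point),
+// dct_const_round_shift(4 * 11585) == ROUND_POWER_OF_TWO(46340, 14) == 3,
+// i.e. the Q14 fixed-point product is rounded back to coefficient precision.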
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+  const int32_t int_max = (1 << (7 + bd)) - 1;
+  const int32_t int_min = -int_max - 1;
+  assert(int_min <= input);
+  assert(input <= int_max);
+  (void) int_min;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  (void) bd;
+  return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return highbd_check_range(rv, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform uses a non-normative
+// method to handle overflows. A stream that causes overflows in the inverse
+// transform is considered invalid in VP9, and a hardware implementer is
+// free to choose any reasonable method to handle overflows. However, to aid
+// in hardware verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended hardware
+// implementation (and also use configure options to trigger the
+// C implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict overflow
+// wrapping to match common hardware implementations.
+// bd of 8 uses tran_low_t with 16 bits, so the top 16 bits must be removed
+// bd of 10 uses tran_low_t with 18 bits, so the top 14 bits must be removed
+// bd of 12 uses tran_low_t with 20 bits, so the top 12 bits must be removed
+// bd of x uses tran_low_t with 8+x bits, so the top 24-x bits must be removed
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif  // CONFIG_EMULATE_HARDWARE
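+
+// Worked example (illustrative only): with CONFIG_EMULATE_HARDWARE enabled
+// and bd == 8, WRAPLOW(0x12345, 8) shifts left and then right by 16 bits, so
+// only the low 16 bits survive (sign-extended) and the value wraps to
+// 0x2345. With CONFIG_EMULATE_HARDWARE disabled, WRAPLOW() is a plain
+// int32_t cast and no wrapping occurs.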
+
+void idct4_c(const tran_low_t *input, tran_low_t *output);
+void idct8_c(const tran_low_t *input, tran_low_t *output);
+void idct16_c(const tran_low_t *input, tran_low_t *output);
+void idct32_c(const tran_low_t *input, tran_low_t *output);
+void iadst4_c(const tran_low_t *input, tran_low_t *output);
+void iadst8_c(const tran_low_t *input, tran_low_t *output);
+void iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  trans = WRAPLOW(trans, bd);
+  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+  trans = WRAPLOW(trans, 8);
+  return clip_pixel(WRAPLOW(dest + trans, 8));
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_INV_TXFM_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -169,6 +169,43 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
 endif  # CONFIG_VP9_ENCODER
 
+# inverse transform
+ifeq ($(CONFIG_VP9),yes)
+DSP_SRCS-yes            += inv_txfm.h
+DSP_SRCS-yes            += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.asm
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
+endif  # CONFIG_USE_X86INC
+endif  # ARCH_X86_64
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
+DSP_SRCS-yes  += arm/idct4x4_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_add_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
+endif  # CONFIG_VP9
+
 # quantization
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += quantize.c
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -593,6 +593,193 @@
 }  # CONFIG_VP9_ENCODER
 
 #
+# Inverse transform
+if (vpx_config("CONFIG_VP9") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_1_add/;
+
+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_16_add/;
+
+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_1_add/;
+
+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_64_add/;
+
+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_12_add/;
+
+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_1_add/;
+
+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_256_add/;
+
+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_10_add/;
+
+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1024_add/;
+
+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_34_add/;
+
+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1_add/;
+
+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_16_add/;
+
+  add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct4x4_1_add/;
+
+  add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct8x8_1_add/;
+
+  add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct16x16_1_add/;
+
+  add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_1024_add/;
+
+  add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_34_add/;
+
+  add_proto qw/void vp9_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_idct32x32_1_add/;
+
+  add_proto qw/void vp9_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp9_highbd_iwht4x4_16_add/;
+
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct4x4_16_add/;
+
+    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct8x8_64_add/;
+
+    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct8x8_10_add/;
+
+    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct16x16_256_add/;
+
+    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct16x16_10_add/;
+  } else {
+    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct4x4_16_add sse2/;
+
+    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct8x8_64_add sse2/;
+
+    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct8x8_10_add sse2/;
+
+    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct16x16_256_add sse2/;
+
+    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp9_highbd_idct16x16_10_add sse2/;
+  }  # CONFIG_EMULATE_HARDWARE
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_1_add/;
+
+    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_16_add/;
+
+    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_1_add/;
+
+    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_64_add/;
+
+    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_12_add/;
+
+    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_1_add/;
+
+    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_256_add/;
+
+    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_10_add/;
+
+    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1024_add/;
+
+    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_34_add/;
+
+    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1_add/;
+
+    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_1_add/;
+
+    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_16_add/;
+  } else {
+    add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct4x4_16_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
+    # Need to add an idct32x32 NEON implementation for the 34-coefficient (eob) case.
+    $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+    add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_1_add msa/;
+
+    add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
+  }  # CONFIG_EMULATE_HARDWARE
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9
+
+#
 # Quantization
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_sse2.asm
@@ -1,0 +1,102 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+  ; a c d b  to  a b c d
+  SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+  ; input:
+  ; m0 a
+  ; m1 b
+  ; m2 c
+  ; m3 d
+  paddw           m0,        m2
+  psubw           m3,        m1
+
+  ; wide subtract
+  punpcklwd       m4,        m0
+  punpcklwd       m5,        m3
+  psrad           m4,        16
+  psrad           m5,        16
+  psubd           m4,        m5
+  psrad           m4,        1
+  packssdw        m4,        m4             ; e
+
+  psubw           m5,        m4,        m1  ; b
+  psubw           m4,        m2             ; c
+  psubw           m0,        m5
+  paddw           m3,        m4
+                                ; m0 a
+  SWAP            1,         5  ; m1 b
+  SWAP            2,         4  ; m2 c
+                                ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  punpcklwd       m0,        m2
+  punpcklwd       m1,        m3
+  mova            m2,        m0
+  punpcklwd       m0,        m1
+  punpckhwd       m2,        m1
+  pshufd          m1,        m0, 0x0e
+  pshufd          m3,        m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+  mova            m3, m0
+  punpcklwd       m0, m1
+  punpckhwd       m3, m1
+  mova            m2, m0
+  punpcklwd       m0, m3
+  punpckhwd       m2, m3
+  pshufd          m1, m0, 0x0e
+  pshufd          m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movq            m%3,       [outputq]
+  movq            m%4,       [outputq + strideq]
+  punpcklbw       m%3,       m%5
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1
+  movd            [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+  mova            m0,        [inputq +  0]
+  mova            m1,        [inputq + 16]
+
+  psraw           m0,        2
+  psraw           m1,        2
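+  ; the two arithmetic shifts above undo the x4 scaling of the lossless WHT
+  ; coefficients (the C reference shifts by UNIT_QUANT_SHIFT, assumed == 2)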
+
+  TRANSPOSE_4X4_WIDE
+  REORDER_INPUTS
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  REORDER_INPUTS
+  TRANSFORM_COLS
+
+  pxor            m4, m4
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4
+  lea             outputq, [outputq + 2 * strideq]
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4
+
+  RET
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -1,0 +1,4053 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
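+// Add four 16-bit residuals (in_x) to four destination pixels, saturate to
+// 8 bits and store the result back to dest.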
+#define RECON_AND_STORE4X4(dest, in_x) \
+{                                                     \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+  d0 = _mm_unpacklo_epi8(d0, zero); \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); \
+  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16(
+      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_load_si128((const __m128i *)input);
+  input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+  input1 = _mm_unpackhi_epi32(input0, input0);
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input3 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input1);
+  input1 = _mm_packs_epi32(input2, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch columns 2 and 3, and then we get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch columns 2 and 3, and then we get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store input0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store input1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store input2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store input3
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 4);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+void idct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+
+  transpose_4x4(in);
+  // stage 1
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[3], v[2]);
+
+  // stage 2
+  in[0] = _mm_add_epi16(u[0], u[1]);
+  in[1] = _mm_sub_epi16(u[0], u[1]);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+void iadst4_sse2(__m128i *in) {
+  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8], in7;
+
+  transpose_4x4(in);
+  in7 = _mm_srli_si128(in[1], 8);
+  in7 = _mm_add_epi16(in7, in[0]);
+  in7 = _mm_sub_epi16(in7, in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(v[3], v[4]);
+  u[2] = v[2];
+  u[3] = _mm_add_epi32(u[0], u[1]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_add_epi32(u[3], v[5]);
+  u[6] = _mm_sub_epi32(u[5], u[4]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+                                                        \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+                                                            \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+                         out0, out1, out2, out3) \
+  {                                              \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  {                                            \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+  }
+
+// Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+              out0, out1, out2, out3, out4, out5, out6, out7)  \
+  { \
+  /* Stage1 */      \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4  */ \
+  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+  }
+
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+  // 2-D
+  for (i = 0; i < 2; i++) {
+    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+                  in0, in1, in2, in3, in4, in5, in6, in7);
+
+    // 4-stage 1D idct8x8
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+          in0, in1, in2, in3, in4, in5, in6, in7);
+  }
+
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest + 0 * stride, dc_value);
+  RECON_AND_STORE(dest + 1 * stride, dc_value);
+  RECON_AND_STORE(dest + 2 * stride, dc_value);
+  RECON_AND_STORE(dest + 3 * stride, dc_value);
+  RECON_AND_STORE(dest + 4 * stride, dc_value);
+  RECON_AND_STORE(dest + 5 * stride, dc_value);
+  RECON_AND_STORE(dest + 6 * stride, dc_value);
+  RECON_AND_STORE(dest + 7 * stride, dc_value);
+}
+
+void idct8_sse2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
+  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                in0, in1, in2, in3, in4, in5, in6, in7);
+
+  // 4-stage 1D idct8x8
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+void iadst8_sse2(__m128i *in) {
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose
+  array_transpose_8x8(in, in);
+
+  // properly aligned for butterfly input
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
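+// Handles the reduced case where only the top-left 4x4 quadrant of the 8x8
+// coefficient block is assumed to be nonzero (eob <= 12).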
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // Rows. Load 4-row input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+  // 8x4 Transpose
+  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+  // Stage1
+  {
+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+  }
+
+  // Stage2
+  {
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+    stp2_4 = tmp0;
+    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
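+  // Columns: full 8-point IDCT on the transposed intermediate values.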
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+        in0, in1, in2, in3, in4, in5, in6, in7);
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
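+// One 16-point 1-D IDCT pass (stages 2-6) over the 16 row vectors in in[];
+// the caller performs the final stage-7 butterflies.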
+#define IDCT16 \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+    \
+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  } \
+  \
+  /* Stage4 */ \
+  { \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+    \
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+    \
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+    \
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+  } \
+    \
+  /* Stage6 */ \
+  { \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+    \
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  }
+
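+// Reduced variant of IDCT16 for the case where only in[0]..in[3] carry
+// nonzero coefficients; the remaining inputs are treated as zero.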
+#define IDCT16_10 \
+    /* Stage2 */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3 */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[16], l[16], r[16], *curr1;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  curr1 = l;
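+  // First pass: 1-D transform of the top and bottom 8-row halves of the
+  // input; the results are stored in l[] and r[] respectively.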
+  for (i = 0; i < 2; i++) {
+    // 1-D idct
+
+    // Load input data.
+    in[0] = _mm_load_si128((const __m128i *)input);
+    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+
+    IDCT16
+
+    // Stage7
+    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    curr1 = r;
+    input += 128;
+  }
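+  // Second pass: 1-D transform of the left and then right 8-column halves,
+  // followed by final rounding, shift and reconstruction into dest.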
+  for (i = 0; i < 2; i++) {
+    int j;
+    // 1-D idct
+    array_transpose_8x8(l + i * 8, in);
+    array_transpose_8x8(r + i * 8, in + 8);
+
+    IDCT16
+
+    // Stage7: final butterflies of the second pass; this completes the 2-D result
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
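+// DC-only case: the single reconstructed value is computed once and added to
+// every pixel of the 16x16 block.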
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 2; ++i) {
+    RECON_AND_STORE(dest +  0 * stride, dc_value);
+    RECON_AND_STORE(dest +  1 * stride, dc_value);
+    RECON_AND_STORE(dest +  2 * stride, dc_value);
+    RECON_AND_STORE(dest +  3 * stride, dc_value);
+    RECON_AND_STORE(dest +  4 * stride, dc_value);
+    RECON_AND_STORE(dest +  5 * stride, dc_value);
+    RECON_AND_STORE(dest +  6 * stride, dc_value);
+    RECON_AND_STORE(dest +  7 * stride, dc_value);
+    RECON_AND_STORE(dest +  8 * stride, dc_value);
+    RECON_AND_STORE(dest +  9 * stride, dc_value);
+    RECON_AND_STORE(dest + 10 * stride, dc_value);
+    RECON_AND_STORE(dest + 11 * stride, dc_value);
+    RECON_AND_STORE(dest + 12 * stride, dc_value);
+    RECON_AND_STORE(dest + 13 * stride, dc_value);
+    RECON_AND_STORE(dest + 14 * stride, dc_value);
+    RECON_AND_STORE(dest + 15 * stride, dc_value);
+    dest += 8;
+  }
+}
+
+static void iadst16_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
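+// 16-point 1-D IDCT over 8 columns held in in[0..15]; the result is written
+// back to in[] in place.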
+static void idct16_8col(__m128i *in) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[8]  = _mm_packs_epi32(u[0], u[1]);
+  s[15] = _mm_packs_epi32(u[2], u[3]);
+  s[9]  = _mm_packs_epi32(u[4], u[5]);
+  s[14] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = _mm_packs_epi32(u[8], u[9]);
+  s[13] = _mm_packs_epi32(u[10], u[11]);
+  s[11] = _mm_packs_epi32(u[12], u[13]);
+  s[12] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  t[0] = s[0];
+  t[1] = s[1];
+  t[2] = s[2];
+  t[3] = s[3];
+  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[4] = _mm_packs_epi32(u[0], u[1]);
+  t[7] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = _mm_packs_epi32(u[4], u[5]);
+  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_add_epi16(t[4], t[5]);
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  s[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  s[15] = t[15];
+  s[9]  = _mm_packs_epi32(u[8], u[9]);
+  s[14] = _mm_packs_epi32(u[10], u[11]);
+  s[10] = _mm_packs_epi32(u[12], u[13]);
+  s[13] = _mm_packs_epi32(u[14], u[15]);
+  s[11] = t[11];
+  s[12] = t[12];
+
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  t[4] = s[4];
+  t[7] = s[7];
+
+  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  t[5] = _mm_packs_epi32(u[0], u[1]);
+  t[6] = _mm_packs_epi32(u[2], u[3]);
+
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
+
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  s[8] = t[8];
+  s[9] = t[9];
+
+  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  s[10] = _mm_packs_epi32(u[0], u[1]);
+  s[13] = _mm_packs_epi32(u[2], u[3]);
+  s[11] = _mm_packs_epi32(u[4], u[5]);
+  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[14] = t[14];
+  s[15] = t[15];
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], s[15]);
+  in[1] = _mm_add_epi16(s[1], s[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], s[9]);
+  in[7] = _mm_add_epi16(s[7], s[8]);
+  in[8] = _mm_sub_epi16(s[7], s[8]);
+  in[9] = _mm_sub_epi16(s[6], s[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], s[14]);
+  in[15] = _mm_sub_epi16(s[0], s[15]);
+}
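The pattern repeated through the stages above (and throughout the rest of this file) — unpack two rows into 16-bit pairs, `_mm_madd_epi16` against a `pair_set_epi16` constant, add `k__DCT_CONST_ROUNDING`, arithmetic-shift by `DCT_CONST_BITS`, then pack back to 16 bits — is the vector form of libvpx's scalar fixed-point rotation. A minimal scalar sketch of what one lane computes (helper names here are illustrative; `DCT_CONST_BITS` is 14):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    // Round a 32-bit product sum back to 16 bits, as the vector code does
    // with the add-rounding-constant + arithmetic-shift + pack sequence.
    static int16_t dct_round_shift(int32_t v) {
      return (int16_t)((v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }

    // Per-lane view of _mm_madd_epi16 on an interleaved (a, b) pair against
    // a pair_set_epi16(c0, c1) constant; signs live in the constant pair.
    static int16_t madd_round_shift(int16_t a, int16_t b, int16_t c0,
                                    int16_t c1) {
      return dct_round_shift((int32_t)a * c0 + (int32_t)b * c1);
    }

For example, stage 4 above produces the s[2] lanes as madd_round_shift(t[2], t[3], cospi_24_64, -cospi_8_64), reading k__cospi_p24_m08 as pair_set_epi16(cospi_24_64, -cospi_8_64).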
+
+void idct16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  idct16_8col(in0);
+  idct16_8col(in1);
+}
+
+void iadst16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  iadst16_8col(in0);
+  iadst16_8col(in1);
+}
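idct16_sse2 and iadst16_sse2 each perform one 1-D pass of the separable 2-D transform: a 16x16 transpose followed by the 16-point column routine applied to the two 8-column halves (each __m128i holds eight 16-bit values). A complete 2-D inverse is two such passes; a scalar sketch of that structure, with illustrative names and an explicit gather standing in for the transpose (the final rounding and reconstruction against the prediction happen after the second pass, see the epilogues below):

    #include <stdint.h>

    typedef void (*transform1d)(const int16_t in[16], int16_t out[16]);

    // Separable 2-D inverse: one 1-D transform per row, then one per column.
    static void inv_txfm_16x16(const int16_t in[16 * 16], int16_t out[16 * 16],
                               transform1d row_txfm, transform1d col_txfm) {
      int16_t tmp[16 * 16];
      int16_t col_in[16], col_out[16];
      int r, c;
      for (r = 0; r < 16; ++r)              // first 1-D pass: rows
        row_txfm(&in[r * 16], &tmp[r * 16]);
      for (c = 0; c < 16; ++c) {            // second 1-D pass: columns
        for (r = 0; r < 16; ++r) col_in[r] = tmp[r * 16 + c];
        col_txfm(col_in, col_out);
        for (r = 0; r < 16; ++r) out[r * 16 + c] = col_out[r];
      }
    }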
+
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i in[16], l[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+  // First 1-D inverse DCT
+  // Load input data.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  // Stage2
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+    stp2_10 = _mm_packs_epi32(tmp0, zero);
+    stp2_13 = _mm_packs_epi32(tmp2, zero);
+    stp2_11 = _mm_packs_epi32(tmp4, zero);
+    stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    int j;
+    array_transpose_4X8(l + 8 * i, in);
+
+    IDCT16_10
+
+    // Stage7
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
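The per-row epilogue above — a saturating add of final_rounding (1 << 5), an arithmetic shift right by 6, then RECON_AND_STORE — converts the transform output back to pixel residuals and adds them to the prediction; the same epilogue recurs in the 32x32 paths below. A per-pixel sketch of that step, assuming RECON_AND_STORE performs the usual add-and-saturate reconstruction:

    #include <stdint.h>

    static uint8_t clip_pixel(int value) {  // clamp to the 8-bit pixel range
      return (uint8_t)(value < 0 ? 0 : (value > 255 ? 255 : value));
    }

    // Round the residual by (x + 32) >> 6 and add it to the prediction with
    // clamping; the vector code saturates the 16-bit rounding add and relies
    // on packus for the clamp.
    static void round_and_reconstruct(const int16_t *residual, uint8_t *dest,
                                      int n) {
      int i;
      for (i = 0; i < n; ++i)
        dest[i] = clip_pixel(dest[i] + ((residual[i] + 32) >> 6));
    }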
+
+#define LOAD_DQCOEFF(reg, input) \
+  {  \
+    reg = _mm_load_si128((const __m128i *) input); \
+    input += 8; \
+  }
+
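LOAD_DQCOEFF reads the next eight 16-bit dequantized coefficients into a register and advances the source pointer. In the 32x32 load sequence further below, the interleaved destination indices (in[0], in[8], in[16], in[24], in[1], ...) reflect that a single 32-wide input row spans four __m128i registers. A sketch of an equivalent helper:

    #include <emmintrin.h>
    #include <stdint.h>

    // Equivalent of one LOAD_DQCOEFF(reg, input) invocation: load eight
    // contiguous int16 coefficients (16-byte aligned) and step the pointer.
    static __m128i load_next_8_coeffs(const int16_t **input) {
      const __m128i reg = _mm_load_si128((const __m128i *)*input);
      *input += 8;
      return reg;
    }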
+#define IDCT32_34 \
+/* Stage1 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
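Throughout IDCT32_34 the known-zero inputs are replaced by a zero register in the unpack step (e.g. _mm_unpacklo_epi16(in[1], zero) or _mm_unpacklo_epi16(zero, in[7])), so each _mm_madd_epi16 against a pair_set_epi16(c0, c1) constant reduces to a per-lane multiply by just c0 or just c1; that is what lets this path use the lighter two-output MULTIPLICATION_AND_ADD_2 form (presumably defined earlier in this file, not shown in this hunk). A scalar view of the reduction:

    #include <stdint.h>

    // With the partner lane forced to zero, a*c0 + 0*c1 is simply a*c0,
    // still followed by the usual rounding shift (DCT_CONST_BITS == 14).
    static int16_t mul_round_shift(int16_t a, int16_t c0) {
      return (int16_t)(((int32_t)a * c0 + (1 << 13)) >> 14);
    }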
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+                         stp1_17, stp1_30) \
+  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+  \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+                         stp2_14) \
+  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+                         stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+  \
+  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+  \
+  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  \
+  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+                         stp1_6) \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  \
+  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+                         stp2_2, stp2_3) \
+  \
+  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+// Only the upper-left 8x8 block has non-zero coefficients.
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[32];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data. Only the top-left 8x8 block needs to be loaded.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 32));
+  in[2] = _mm_load_si128((const __m128i *)(input + 64));
+  in[3] = _mm_load_si128((const __m128i *)(input + 96));
+  in[4] = _mm_load_si128((const __m128i *)(input + 128));
+  in[5] = _mm_load_si128((const __m128i *)(input + 160));
+  in[6] = _mm_load_si128((const __m128i *)(input + 192));
+  in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+  for (i = 8; i < 32; ++i) {
+    in[i] = _mm_setzero_si128();
+  }
+
+  array_transpose_8x8(in, in);
+  // TODO(hkuang): The following transposes are unnecessary, but removing
+  // them leads to a performance drop on some devices.
+  array_transpose_8x8(in + 8, in + 8);
+  array_transpose_8x8(in + 16, in + 16);
+  array_transpose_8x8(in + 24, in + 24);
+
+  IDCT32_34
+
+  // 1_D: Store 32 intermediate results for each 8x32 block.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {
+    int j;
+    const __m128i zero = _mm_setzero_si128();
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + i * 8, in);
+    IDCT32_34
+
+    // 2_D: Calculate the results and store them to the destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[128], zero_idx[16];
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  for (i = 0; i < 4; i++) {
+    i32 = (i << 5);
+    // First 1-D idct
+    // Load input data.
+    LOAD_DQCOEFF(in[0], input);
+    LOAD_DQCOEFF(in[8], input);
+    LOAD_DQCOEFF(in[16], input);
+    LOAD_DQCOEFF(in[24], input);
+    LOAD_DQCOEFF(in[1], input);
+    LOAD_DQCOEFF(in[9], input);
+    LOAD_DQCOEFF(in[17], input);
+    LOAD_DQCOEFF(in[25], input);
+    LOAD_DQCOEFF(in[2], input);
+    LOAD_DQCOEFF(in[10], input);
+    LOAD_DQCOEFF(in[18], input);
+    LOAD_DQCOEFF(in[26], input);
+    LOAD_DQCOEFF(in[3], input);
+    LOAD_DQCOEFF(in[11], input);
+    LOAD_DQCOEFF(in[19], input);
+    LOAD_DQCOEFF(in[27], input);
+
+    LOAD_DQCOEFF(in[4], input);
+    LOAD_DQCOEFF(in[12], input);
+    LOAD_DQCOEFF(in[20], input);
+    LOAD_DQCOEFF(in[28], input);
+    LOAD_DQCOEFF(in[5], input);
+    LOAD_DQCOEFF(in[13], input);
+    LOAD_DQCOEFF(in[21], input);
+    LOAD_DQCOEFF(in[29], input);
+    LOAD_DQCOEFF(in[6], input);
+    LOAD_DQCOEFF(in[14], input);
+    LOAD_DQCOEFF(in[22], input);
+    LOAD_DQCOEFF(in[30], input);
+    LOAD_DQCOEFF(in[7], input);
+    LOAD_DQCOEFF(in[15], input);
+    LOAD_DQCOEFF(in[23], input);
+    LOAD_DQCOEFF(in[31], input);
+
+    // Check whether all coefficients in this 8x32 block are zero.
+    zero_idx[0] = _mm_or_si128(in[0], in[1]);
+    zero_idx[1] = _mm_or_si128(in[2], in[3]);
+    zero_idx[2] = _mm_or_si128(in[4], in[5]);
+    zero_idx[3] = _mm_or_si128(in[6], in[7]);
+    zero_idx[4] = _mm_or_si128(in[8], in[9]);
+    zero_idx[5] = _mm_or_si128(in[10], in[11]);
+    zero_idx[6] = _mm_or_si128(in[12], in[13]);
+    zero_idx[7] = _mm_or_si128(in[14], in[15]);
+    zero_idx[8] = _mm_or_si128(in[16], in[17]);
+    zero_idx[9] = _mm_or_si128(in[18], in[19]);
+    zero_idx[10] = _mm_or_si128(in[20], in[21]);
+    zero_idx[11] = _mm_or_si128(in[22], in[23]);
+    zero_idx[12] = _mm_or_si128(in[24], in[25]);
+    zero_idx[13] = _mm_or_si128(in[26], in[27]);
+    zero_idx[14] = _mm_or_si128(in[28], in[29]);
+    zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;
+    }
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    IDCT32
+
+    // 1_D: Store 32 intermediate results for each 8x32 block.
+    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+  }
+  for (i = 0; i < 4; i++) {
+    // Second 1-D idct
+    j = i << 3;
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    IDCT32
+
+    // 2_D: Calculate the results and store them to the destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
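+// DC-only 32x32 inverse transform: when only the DC coefficient is non-zero,
+// every output pixel receives the same constant, so it is computed once and
+// added to the whole 32x32 destination block.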
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 4; ++i) {
+    int j;
+    for (j = 0; j < 32; ++j) {
+      RECON_AND_STORE(dest + j * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
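+// Clamp each 16-bit lane of |value| to the valid pixel range for bit depth
+// |bd|, i.e. [0, (1 << bd) - 1].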
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+  __m128i ubounded, retval;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+  ubounded = _mm_cmpgt_epi16(value, max);
+  retval = _mm_andnot_si128(ubounded, value);
+  ubounded = _mm_and_si128(ubounded, max);
+  retval = _mm_or_si128(retval, ubounded);
+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+  return retval;
+}
+
+void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i temp_mm, min_input, max_input;
+  int test;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int optimised_cols = 0;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(12043);
+  const __m128i min = _mm_set1_epi16(-12043);
+  // Load input into __m128i
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Pack to 16 bits
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
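+  // If any packed coefficient lies outside [-12043, 12043], the 16-bit SSE2
+  // transform could overflow, so the C version is used for that pass instead.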
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp_mm = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp_mm);
+
+  if (!test) {
+    // Do the row transform
+    idct4_sse2(inptr);
+
+    // Check the min & max values
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp_mm = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp_mm);
+
+    if (test) {
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct4_c(input, outptr, bd);
+      input += 4;
+      outptr += 4;
+    }
+  }
+
+  if (optimised_cols) {
+    idct4_sse2(inptr);
+
+    // Final round and shift
+    inptr[0] = _mm_add_epi16(inptr[0], eight);
+    inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+    inptr[0] = _mm_srai_epi16(inptr[0], 4);
+    inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+    // Reconstruction and Store
+    {
+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+      d0 = _mm_unpacklo_epi64(
+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+      d2 = _mm_unpacklo_epi64(
+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // store input0
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // store input1
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // store input2
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // store input3
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[4], temp_out[4];
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j * 4 + i];
+      vp9_highbd_idct4_c(temp_in, temp_out, bd);
+      for (j = 0; j < 4; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 8; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_8x8(inptr, inptr);
+      for (i = 0; i < 8; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 8; ++i) {
+      vp9_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final rounding and shift; reconstruct and store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vp9_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i max = _mm_set1_epi16(6201);
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coeffs.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs.
+      array_transpose_4X8(inptr, inptr);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final rounding and shift; reconstruct and store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vp9_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      array_transpose_16x16(inptr, inptr + 16);
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 16; ++i) {
+      vp9_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final rounding and shift; reconstruct and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp9_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);
+  const __m128i max = _mm_set1_epi16(3155);
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;
+
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
+  // we only need to consider the first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs.
+      array_transpose_8x8(inptr, inptr);
+      array_transpose_8x8(inptr + 8, inptr + 16);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vp9_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final rounding and shift; reconstruct and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp9_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -1,0 +1,184 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+                                                        \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
+  }
+
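+// Transpose the low 4 lanes of 8 input rows into 4 output rows of 8 values
+// each (an 8x4 -> 4x8 transpose).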
+static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
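+// Transpose a 16x16 block held as two halves: res0 holds the left 8 columns
+// and res1 the right 8 columns of all 16 rows. The four 8x8 quadrants are
+// transposed and the off-diagonal quadrants are swapped.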
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];
+  array_transpose_8x8(res0, res0);
+  array_transpose_8x8(res1, tbuf);
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
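+// Load the left 8 coefficients of each of the 16 rows of a 16x16 block.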
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
+  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
+  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
+  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
+  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
+  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
+  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
+  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
+}
+
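+// Load 8 pixels from |dest|, widen them to 16 bits, add the residual |in_x|,
+// saturate back to 8 bits and store the result. A __m128i named |zero| must
+// be in scope at the call site.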
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
+  }
+
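+// Apply the final rounding ((x + 32) >> 6) to 16 rows of 8 residuals each and
+// add them to the destination block.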
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();
+  // Final rounding and shift
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+  in[8] = _mm_adds_epi16(in[8], final_rounding);
+  in[9] = _mm_adds_epi16(in[9], final_rounding);
+  in[10] = _mm_adds_epi16(in[10], final_rounding);
+  in[11] = _mm_adds_epi16(in[11], final_rounding);
+  in[12] = _mm_adds_epi16(in[12], final_rounding);
+  in[13] = _mm_adds_epi16(in[13], final_rounding);
+  in[14] = _mm_adds_epi16(in[14], final_rounding);
+  in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 6);
+  in[1] = _mm_srai_epi16(in[1], 6);
+  in[2] = _mm_srai_epi16(in[2], 6);
+  in[3] = _mm_srai_epi16(in[3], 6);
+  in[4] = _mm_srai_epi16(in[4], 6);
+  in[5] = _mm_srai_epi16(in[5], 6);
+  in[6] = _mm_srai_epi16(in[6], 6);
+  in[7] = _mm_srai_epi16(in[7], 6);
+  in[8] = _mm_srai_epi16(in[8], 6);
+  in[9] = _mm_srai_epi16(in[9], 6);
+  in[10] = _mm_srai_epi16(in[10], 6);
+  in[11] = _mm_srai_epi16(in[11], 6);
+  in[12] = _mm_srai_epi16(in[12], 6);
+  in[13] = _mm_srai_epi16(in[13], 6);
+  in[14] = _mm_srai_epi16(in[14], 6);
+  in[15] = _mm_srai_epi16(in[15], 6);
+
+  RECON_AND_STORE(dest +  0 * stride, in[0]);
+  RECON_AND_STORE(dest +  1 * stride, in[1]);
+  RECON_AND_STORE(dest +  2 * stride, in[2]);
+  RECON_AND_STORE(dest +  3 * stride, in[3]);
+  RECON_AND_STORE(dest +  4 * stride, in[4]);
+  RECON_AND_STORE(dest +  5 * stride, in[5]);
+  RECON_AND_STORE(dest +  6 * stride, in[6]);
+  RECON_AND_STORE(dest +  7 * stride, in[7]);
+  RECON_AND_STORE(dest +  8 * stride, in[8]);
+  RECON_AND_STORE(dest +  9 * stride, in[9]);
+  RECON_AND_STORE(dest + 10 * stride, in[10]);
+  RECON_AND_STORE(dest + 11 * stride, in[11]);
+  RECON_AND_STORE(dest + 12 * stride, in[12]);
+  RECON_AND_STORE(dest + 13 * stride, in[13]);
+  RECON_AND_STORE(dest + 14 * stride, in[14]);
+  RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -1,0 +1,300 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides the SSSE3 version of the inverse transform. Some of
+; the functions were originally derived from the ffmpeg project.
+; Note that the current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
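+; 23170 = 2 * 11585 (cospi_16_64); pmulhrsw by this value scales by
+; cos(pi/4) with rounding.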
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+pw_16:      times 8 dw 16
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
+%endmacro
+
+TRANSFORM_COEFFS    6270, 15137
+TRANSFORM_COEFFS    3196, 16069
+TRANSFORM_COEFFS   13623,  9102
+
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540
+PAIR_PP_COEFFS      6392, 32138
+PAIR_MP_COEFFS     18204, 27246
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
+SECTION .text
+
+%if ARCH_X86_64
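+; In-place butterfly: %1 = %1 + %2, %2 = %1 - %2 (uses %3 as scratch).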
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro IDCT8_1D 0
+  SUM_SUB          0,    4,    9
+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
+  pmulhrsw        m0,  m12
+  pmulhrsw        m4,  m12
+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
+
+  SUM_SUB          1,    5,    9
+  SUM_SUB          7,    3,    9
+  SUM_SUB          0,    6,    9
+  SUM_SUB          4,    2,    9
+  SUM_SUB          3,    5,    9
+  pmulhrsw        m3,  m12
+  pmulhrsw        m5,  m12
+
+  SUM_SUB          0,    7,    9
+  SUM_SUB          4,    3,    9
+  SUM_SUB          2,    5,    9
+  SUM_SUB          6,    1,    9
+
+  SWAP             3,    6
+  SWAP             1,    4
+%endmacro
+
+; This macro handles 8 pixels per line
+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
+  paddw           m%1, m11
+  paddw           m%2, m11
+  psraw           m%1, 5
+  psraw           m%2, 5
+
+  movh            m%3, [outputq]
+  movh            m%4, [outputq + strideq]
+  punpcklbw       m%3, m%5
+  punpcklbw       m%4, m%5
+  paddw           m%3, m%1
+  paddw           m%4, m%2
+  packuswb        m%3, m%5
+  packuswb        m%4, m%5
+  movh               [outputq], m%3
+  movh     [outputq + strideq], m%4
+%endmacro
+
+INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
+  mova     m8, [pd_8192]
+  mova    m11, [pw_16]
+  mova    m12, [pw_11585x2]
+
+  lea      r3, [2 * strideq]
+
+  mova     m0, [inputq +   0]
+  mova     m1, [inputq +  16]
+  mova     m2, [inputq +  32]
+  mova     m3, [inputq +  48]
+  mova     m4, [inputq +  64]
+  mova     m5, [inputq +  80]
+  mova     m6, [inputq +  96]
+  mova     m7, [inputq + 112]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+%endif