ref: 5d0a271ded78c1fe86ca583bef29d16c80d4feba
dir: /vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm/
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

; This file provides SSSE3 version of the inverse transformation. Part
; of the functions are originally derived from the ffmpeg project.
; Note that the current version applies to x86 64-bit only.

SECTION_RODATA

; pw_<c>x2 tables: eight 16-bit words, each holding 2 * c ("m" in the label
; marks a negative c, leading underscores only pad the label width).
; They are consumed by pmulhrsw in the transform macros; since pmulhrsw
; yields round(a * b / 2^15), multiplying by 2 * c gives round(a * c / 2^14).
pw_11585x2: times 8 dw 23170 ; 23170 = 2 * 11585

pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m8423x2: times 8 dw -8423*2
pw_m9102x2: times 8 dw -9102*2
pw_m10394x2: times 8 dw -10394*2
pw_m11003x2: times 8 dw -11003*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
pw_16207x2: times 8 dw 16207*2
pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw_15137x2: times 8 dw 15137*2
pw_14811x2: times 8 dw 14811*2
pw_14449x2: times 8 dw 14449*2
pw_14053x2: times 8 dw 14053*2
pw_13623x2: times 8 dw 13623*2
pw_13160x2: times 8 dw 13160*2
pw_12665x2: times 8 dw 12665*2
pw_12140x2: times 8 dw 12140*2
pw__9760x2: times 8 dw 9760*2
pw__7723x2: times 8 dw 7723*2
pw__7005x2: times 8 dw 7005*2
pw__6270x2: times 8 dw 6270*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
pw___804x2: times 8 dw 804*2

; Rounding terms.
; pd_8192 (1 << 13) is added before the "psrad ..., 14" in MUL_ADD_2X.
; pw_32 / pw_16 are added before the "psraw ..., 6" / "psraw ..., 5" of the
; reconstruction macros (RECON_AND_STORE / ADD_STORE_8P_2X).
pd_8192: times 4 dd 8192
pw_32: times 8 dw 32
pw_16: times 8 dw 16

; TRANSFORM_COEFFS a, b
; Emits three 8-word tables of interleaved coefficient pairs, laid out for
; pmaddwd on word-interleaved inputs (see BUTTERFLY_4X / BUTTERFLY_4Xmm):
;   pw_<a>_<b>   = ( a,  b) x 4
;   pw_m<b>_<a>  = (-b,  a) x 4
;   pw_m<a>_m<b> = (-a, -b) x 4
%macro TRANSFORM_COEFFS 2
pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro

TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102

; constants for 32x32_34
TRANSFORM_COEFFS 804, 16364
TRANSFORM_COEFFS 15426, 5520
TRANSFORM_COEFFS 3981, 15893
TRANSFORM_COEFFS 16207, 2404
TRANSFORM_COEFFS 1606, 16305
TRANSFORM_COEFFS 15679, 4756
TRANSFORM_COEFFS 11585, 11585

; constants for 32x32_1024
TRANSFORM_COEFFS 12140, 11003
TRANSFORM_COEFFS 7005, 14811
TRANSFORM_COEFFS 14053, 8423
TRANSFORM_COEFFS 9760, 13160
TRANSFORM_COEFFS 12665, 10394
TRANSFORM_COEFFS 7723, 14449

; PAIR_xx_COEFFS a, b
; Emit one 8-word table whose low four words hold +/-a and whose high four
; words hold +/-b (PP: +a,+b  MP: -a,+b  MM: -a,-b). They are used as
; pmulhrsw operands in idct8x8_12_add, where each register carries two
; different 4-word values side by side.
%macro PAIR_PP_COEFFS 2
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
%endmacro

%macro PAIR_MP_COEFFS 2
dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
%endmacro

%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro

PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS 6392, 32138
PAIR_MP_COEFFS 18204, 27246

PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS 6392, 6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246

SECTION .text

%if ARCH_X86_64
; SUM_SUB a, b, tmp
; 16-bit butterfly on eight words: afterwards m<a> = a + b and m<b> = a - b
; (paddw/psubw, i.e. wrapping word arithmetic, no saturation).
; The difference is computed into m<tmp> and then SWAP renames it to <b>,
; so m<tmp> ends up holding the old b and must be treated as clobbered.
%macro SUM_SUB 3
  psubw m%3, m%1, m%2
  paddw m%1, m%2
  SWAP %2, %3
%endmacro

; butterfly operation
; MUL_ADD_2X dst1, dst2, src, round, coefs1, coefs2
;   m<dst1> = (pmaddwd(m<src>, coefs1) + round) >> 14
;   m<dst2> = (pmaddwd(m<src>, coefs2) + round) >> 14
; Results are four 32-bit lanes (arithmetic shift). dst2 may alias src
; (BUTTERFLY_4X relies on that); dst1 must not, since src is read again.
%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
  pmaddwd m%1, m%3, %5
  pmaddwd m%2, m%3, %6
  paddd m%1, %4
  paddd m%2, %4
  psrad m%1, 14
  psrad m%2, 14
%endmacro

; BUTTERFLY_4X dst1, dst2, coef1, coef2, round, tmp1, tmp2
; With a = m<dst1> and b = m<dst2> on entry, computes per 16-bit lane
;   m<dst1> = (a * coef1 - b * coef2 + round) >> 14
;   m<dst2> = (b * coef1 + a * coef2 + round) >> 14
; in 32-bit precision, then packs back to words with signed saturation.
; The (coef1, coef2) pair must have been emitted with TRANSFORM_COEFFS.
; Clobbers m<tmp1> and m<tmp2>.
%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
  punpckhwd m%6, m%2, m%1
  MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
  punpcklwd m%2, m%1
  MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
  packssdw m%1, m%7
  packssdw m%2, m%6
%endmacro

; BUTTERFLY_4Xmm: same as BUTTERFLY_4X except that the second output uses
; the (-coef1, -coef2) table, i.e.
;   m<dst1> = (a * coef1 - b * coef2 + round) >> 14
;   m<dst2> = (-b * coef1 - a * coef2 + round) >> 14
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
  punpckhwd m%6, m%2, m%1
  MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
  punpcklwd m%2, m%1
  MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
  packssdw m%1, m%7
  packssdw m%2, m%6
%endmacro

; matrix transpose
; INTERLEAVE_2X suffix, a, b, tmp
; m<a> = punpckl<suffix>(a, b), m<b> = punpckh<suffix>(a, b).
; The high half is built in m<tmp> and renamed to <b> by SWAP, so m<tmp>
; is clobbered (it ends up holding the old b).
%macro INTERLEAVE_2X 4
  punpckh%1 m%4, m%2, m%3
  punpckl%1 m%2, m%3
  SWAP %3, %4
%endmacro

; TRANSPOSE8X8 r0..r7, tmp
; Transposes the 8x8 matrix of 16-bit words held in the eight named
; registers, using three rounds of interleaves (wd, dq, qdq) followed by
; two renames to restore row order. m<tmp> is clobbered.
%macro TRANSPOSE8X8 9
  INTERLEAVE_2X wd, %1, %2, %9
  INTERLEAVE_2X wd, %3, %4, %9
  INTERLEAVE_2X wd, %5, %6, %9
  INTERLEAVE_2X wd, %7, %8, %9

  INTERLEAVE_2X dq, %1, %3, %9
  INTERLEAVE_2X dq, %2, %4, %9
  INTERLEAVE_2X dq, %5, %7, %9
  INTERLEAVE_2X dq, %6, %8, %9

  INTERLEAVE_2X qdq, %1, %5, %9
  INTERLEAVE_2X qdq, %3, %7, %9
  INTERLEAVE_2X qdq, %2, %6, %9
  INTERLEAVE_2X qdq, %4, %8, %9

  SWAP %2, %5
  SWAP %4, %7
%endmacro

; IDCT8_1D
; One 1-D pass of the 8-point inverse transform over m0-m7 (in place).
; Expects m8 = pd_8192 (rounding for BUTTERFLY_4X) and m12 = pw_11585x2.
; Clobbers m9 and m10 (scratch). The trailing SWAPs put the results back
; into m0..m7 order.
%macro IDCT8_1D 0
  SUM_SUB 0, 4, 9
  BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
  pmulhrsw m0, m12
  pmulhrsw m4, m12
  BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
  BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10

  SUM_SUB 1, 5, 9
  SUM_SUB 7, 3, 9
  SUM_SUB 0, 6, 9
  SUM_SUB 4, 2, 9
  SUM_SUB 3, 5, 9
  pmulhrsw m3, m12
  pmulhrsw m5, m12

  SUM_SUB 0, 7, 9
  SUM_SUB 4, 3, 9
  SUM_SUB 2, 5, 9
  SUM_SUB 6, 1, 9

  SWAP 3, 6
  SWAP 1, 4
%endmacro

; This macro handles 8 pixels per line
; ADD_STORE_8P_2X src1, src2, tmp1, tmp2, zero
; Reconstructs two rows of 8 pixels: rounds the residual rows m<src1> and
; m<src2> with (x + m11) >> 5, adds them to the bytes at [outputq] and
; [outputq + strideq], saturates to unsigned 8 bit and stores them back.
; Expects m11 = pw_16 and m<zero> = all zeroes; clobbers both src and tmp
; registers. outputq is not advanced.
%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
  paddw m%1, m11
  paddw m%2, m11
  psraw m%1, 5
  psraw m%2, 5

  movh m%3, [outputq]
  movh m%4, [outputq + strideq]
  punpcklbw m%3, m%5
  punpcklbw m%4, m%5
  paddw m%3, m%1
  paddw m%4, m%2
  packuswb m%3, m%5
  packuswb m%4, m%5
  movh [outputq], m%3
  movh [outputq + strideq], m%4
%endmacro

INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
;
; Args (x86inc names): input  - 64 coefficients, read with aligned loads
;                               (16-bit, or 32-bit packed down with
;                               packssdw when CONFIG_VP9_HIGHBITDEPTH),
;                      output - 8x8 block of 8-bit pixels, updated in place,
;                      stride - byte distance between output rows.
; Uses r3 = 2 * stride and m0-m12. Two transpose + IDCT8_1D passes give the
; 2-D transform; the result is rounded and added to the destination.
; NOTE(review): strideq is used as a full 64-bit register without sign
; extension - confirm the caller passes a 64-bit clean stride.
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  mova m8, [pd_8192]
  mova m11, [pw_16]
  mova m12, [pw_11585x2]

  lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
  mova m0, [inputq + 0]
  packssdw m0, [inputq + 16]
  mova m1, [inputq + 32]
  packssdw m1, [inputq + 48]
  mova m2, [inputq + 64]
  packssdw m2, [inputq + 80]
  mova m3, [inputq + 96]
  packssdw m3, [inputq + 112]
  mova m4, [inputq + 128]
  packssdw m4, [inputq + 144]
  mova m5, [inputq + 160]
  packssdw m5, [inputq + 176]
  mova m6, [inputq + 192]
  packssdw m6, [inputq + 208]
  mova m7, [inputq + 224]
  packssdw m7, [inputq + 240]
%else
  mova m0, [inputq + 0]
  mova m1, [inputq + 16]
  mova m2, [inputq + 32]
  mova m3, [inputq + 48]
  mova m4, [inputq + 64]
  mova m5, [inputq + 80]
  mova m6, [inputq + 96]
  mova m7, [inputq + 112]
%endif
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D

  ; m12 is no longer needed as a multiplier; reuse it as the zero register
  pxor m12, m12
  ADD_STORE_8P_2X 0, 1, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 2, 3, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 4, 5, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 6, 7, 9, 10, 12

  RET

; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
;
; Same interface as idct8x8_64_add. Only the first four input rows are
; loaded and only their low four words are used (punpcklwd), i.e. the code
; reads the top-left 4x4 coefficients and treats everything else as zero.
; NOTE(review): the comment above says "10" while the function name says
; "12" - confirm which count the callers actually guarantee.
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
  mova m8, [pd_8192]
  mova m11, [pw_16]
  mova m12, [pw_11585x2]

  lea r3, [2 * strideq]

%if CONFIG_VP9_HIGHBITDEPTH
  mova m0, [inputq + 0]
  packssdw m0, [inputq + 16]
  mova m1, [inputq + 32]
  packssdw m1, [inputq + 48]
  mova m2, [inputq + 64]
  packssdw m2, [inputq + 80]
  mova m3, [inputq + 96]
  packssdw m3, [inputq + 112]
%else
  mova m0, [inputq + 0]
  mova m1, [inputq + 16]
  mova m2, [inputq + 32]
  mova m3, [inputq + 48]
%endif

  ; 4x4 transpose of the low words of rows 0-3
  punpcklwd m0, m1
  punpcklwd m2, m3
  punpckhdq m9, m0, m2
  punpckldq m0, m2
  SWAP 2, 9

  ; m0 -> [0], [0]
  ; m1 -> [1], [1]
  ; m2 -> [2], [2]
  ; m3 -> [3], [3]
  punpckhqdq m10, m0, m0
  punpcklqdq m0, m0
  punpckhqdq m9, m2, m2
  punpcklqdq m2, m2
  SWAP 1, 10
  SWAP 3, 9

  ; first 1-D pass; each register carries two 4-word values, hence the
  ; two-coefficient dpw_* tables
  pmulhrsw m0, m12
  pmulhrsw m2, [dpw_30274_12540]
  pmulhrsw m1, [dpw_6392_32138]
  pmulhrsw m3, [dpw_m18204_27246]

  SUM_SUB 0, 2, 9
  SUM_SUB 1, 3, 9

  punpcklqdq m9, m3, m3
  punpckhqdq m5, m3, m9

  SUM_SUB 3, 5, 9
  punpckhqdq m5, m3
  pmulhrsw m5, m12

  punpckhqdq m9, m1, m5
  punpcklqdq m1, m5
  SWAP 5, 9

  SUM_SUB 0, 5, 9
  SUM_SUB 2, 1, 9

  ; rearrange the first-pass results into rows for the second pass
  punpckhqdq m3, m0, m0
  punpckhqdq m4, m1, m1
  punpckhqdq m6, m5, m5
  punpckhqdq m7, m2, m2

  punpcklwd m0, m3
  punpcklwd m7, m2
  punpcklwd m1, m4
  punpcklwd m6, m5

  punpckhdq m4, m0, m7
  punpckldq m0, m7
  punpckhdq m10, m1, m6
  punpckldq m5, m1, m6

  punpckhqdq m1, m0, m5
  punpcklqdq m0, m5
  punpckhqdq m3, m4, m10
  punpcklqdq m2, m4, m10

  ; second 1-D pass on full 8-word rows
  pmulhrsw m0, m12
  pmulhrsw m6, m2, [dpw_30274_30274]
  pmulhrsw m4, m2, [dpw_12540_12540]

  pmulhrsw m7, m1, [dpw_32138_32138]
  pmulhrsw m1, [dpw_6392_6392]
  pmulhrsw m5, m3, [dpw_m18204_m18204]
  pmulhrsw m3, [dpw_27246_27246]

  mova m2, m0
  SUM_SUB 0, 6, 9
  SUM_SUB 2, 4, 9
  SUM_SUB 1, 5, 9
  SUM_SUB 7, 3, 9

  SUM_SUB 3, 5, 9
  pmulhrsw m3, m12
  pmulhrsw m5, m12

  SUM_SUB 0, 7, 9
  SUM_SUB 2, 3, 9
  SUM_SUB 4, 5, 9
  SUM_SUB 6, 1, 9

  SWAP 3, 6
  SWAP 1, 2
  SWAP 2, 4

  ; m12 becomes the zero register for the reconstruction
  pxor m12, m12
  ADD_STORE_8P_2X 0, 1, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 2, 3, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 4, 5, 9, 10, 12
  lea outputq, [outputq + r3]
  ADD_STORE_8P_2X 6, 7, 9, 10, 12

  RET

; idxN = byte offset of coefficient N inside its group of eight 16-byte
; slots, i.e. 16 * (N mod 8). The base of the group (0-7, 8-15, 16-23,
; 24-31) is supplied separately through the %1-%4 arguments of the
; IDCT32X32_* macros.
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7

; FROM idct32x32_add_neon.asm
;
; Instead of doing the transforms stage by stage, it is done by loading
; some input values and doing as many stages as possible to minimize the
; storing/loading of intermediate results. To fit within registers, the
; final coefficients are cut into four blocks:
; BLOCK A: 16-19,28-31
; BLOCK B: 20-23,24-27
; BLOCK C: 8-11,12-15
; BLOCK D: 0-3,4-7
; Blocks A and C are straight calculation through the various stages. In
; block B, further calculations are performed using the results from
; block A. In block D, further calculations are performed using the results
; from block C and then the final calculations are done using results from
; block A and B which have been combined at the end of block B.
;

; IDCT32X32_34 off0, off8, off16, off24
; One 1-D pass of the 32-point inverse transform for eight lanes, using
; only input rows 0-7 (all higher rows are treated as zero).
;
; On entry: m0-m7 = input rows 0-7 (already transposed by the caller),
;           m8    = pd_8192 (rounding term for the butterflies),
;           r4    = scratch base; rows 0, 2, 4 and 6 are spilled to
;                   [r4 + 16 * row] and reloaded later through
;                   [rsp + transposed_in + 16 * row], so r4 must equal
;                   rsp + transposed_in,
;           stp   = base of the output area.
; The 32 results are written to [stp + offG + idxN], where off0 / off8 /
; off16 / off24 (%1-%4) are the byte offsets of the 0-7 / 8-15 / 16-23 /
; 24-31 groups; the same slots also serve as spill space while the macro
; runs. Clobbers m0-m7 and m9-m15.
%macro IDCT32X32_34 4
  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; the spills of the even rows are interleaved with the arithmetic
  mova m11, m1
  pmulhrsw m1, [pw___804x2] ; stp1_16
  mova [r4 + 0], m0
  pmulhrsw m11, [pw_16364x2] ; stp2_31
  mova [r4 + 16 * 2], m2
  mova m12, m7
  pmulhrsw m7, [pw_15426x2] ; stp1_28
  mova [r4 + 16 * 4], m4
  pmulhrsw m12, [pw_m5520x2] ; stp2_19
  mova [r4 + 16 * 6], m6

  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m2, m1 ; stp1_16
  mova m0, m11 ; stp1_31
  mova m4, m7 ; stp1_28
  mova m15, m12 ; stp1_19

  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18

  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
  SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
  SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
  SUM_SUB 2, 4, 9 ; stp2_30, stp2_29

  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
  BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28

  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; block A results 28-31 are parked in the output area meanwhile
  mova m6, m5
  pmulhrsw m5, [pw__3981x2] ; stp1_20
  mova [stp + %4 + idx28], m12
  mova [stp + %4 + idx29], m15
  pmulhrsw m6, [pw_15893x2] ; stp2_27
  mova [stp + %4 + idx30], m2
  mova m2, m3
  pmulhrsw m3, [pw_m2404x2] ; stp1_23
  mova [stp + %4 + idx31], m11
  pmulhrsw m2, [pw_16207x2] ; stp2_24

  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m13, m5 ; stp1_20
  mova m14, m6 ; stp1_27
  mova m15, m3 ; stp1_23
  mova m11, m2 ; stp1_24

  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
  BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22

  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
  SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
  SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
  SUM_SUB 11, 13, 9 ; stp2_25, stp2_26

  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
  BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21

  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
  SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
  SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
  SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
  mova [stp + %3 + idx16], m1
  mova [stp + %3 + idx17], m0
  mova [stp + %3 + idx18], m4
  mova [stp + %3 + idx19], m7

  mova m4, [stp + %4 + idx28]
  mova m7, [stp + %4 + idx29]
  mova m10, [stp + %4 + idx30]
  mova m12, [stp + %4 + idx31]
  SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
  SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
  SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
  SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
  mova [stp + %4 + idx28], m4
  mova [stp + %4 + idx29], m7
  mova [stp + %4 + idx30], m10
  mova [stp + %4 + idx31], m12

  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; the 16-bit SUM_SUB + pmulhrsw form is disabled; the 32-bit butterfly
  ; below is used instead
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova m10, [pw_11585x2]
  SUM_SUB 6, 5, 9
  pmulhrsw m6, m10 ; stp1_27
  pmulhrsw m5, m10 ; stp1_20
  SUM_SUB 13, 14, 9
  pmulhrsw m13, m10 ; stp1_26
  pmulhrsw m14, m10 ; stp1_21
  SUM_SUB 11, 15, 9
  pmulhrsw m11, m10 ; stp1_25
  pmulhrsw m15, m10 ; stp1_22
  SUM_SUB 2, 3, 9
  pmulhrsw m2, m10 ; stp1_24
  pmulhrsw m3, m10 ; stp1_23
%else
  BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
  SWAP 6, 5
  BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
  SWAP 13, 14
  BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
  SWAP 11, 15
  BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
  SWAP 2, 3
%endif

  mova [stp + %4 + idx24], m2
  mova [stp + %4 + idx25], m11
  mova [stp + %4 + idx26], m13
  mova [stp + %4 + idx27], m6

  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; reload the spilled rows 2 and 6; the block B results 20-23 are stored
  ; in between the multiplies
  mova m0, [rsp + transposed_in + 16 * 2]
  mova m6, [rsp + transposed_in + 16 * 6]

  mova m1, m0
  pmulhrsw m0, [pw__1606x2] ; stp1_8
  mova [stp + %3 + idx20], m5
  mova [stp + %3 + idx21], m14
  pmulhrsw m1, [pw_16305x2] ; stp2_15
  mova [stp + %3 + idx22], m15
  mova m7, m6
  pmulhrsw m7, [pw_m4756x2] ; stp2_11
  mova [stp + %3 + idx23], m3
  pmulhrsw m6, [pw_15679x2] ; stp1_12

  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m3, m0 ; stp1_8
  mova m2, m1 ; stp1_15

  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
  mova m4, m7 ; stp1_11
  mova m5, m6 ; stp1_12
  BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10

  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
  SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
  SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
  SUM_SUB 3, 5, 9 ; stp1_14, stp1_13

  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova m10, [pw_11585x2]
  SUM_SUB 5, 4, 9
  pmulhrsw m5, m10 ; stp1_13
  pmulhrsw m4, m10 ; stp1_10
  SUM_SUB 6, 7, 9
  pmulhrsw m6, m10 ; stp1_12
  pmulhrsw m7, m10 ; stp1_11
%else
  BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
  SWAP 5, 4
  BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
  SWAP 6, 7
%endif

  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova [stp + %2 + idx8], m0
  mova [stp + %2 + idx9], m2
  mova [stp + %2 + idx10], m4
  mova [stp + %2 + idx11], m7

  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m11, [rsp + transposed_in + 16 * 4]
  mova m12, m11
  pmulhrsw m11, [pw__3196x2] ; stp1_4
  pmulhrsw m12, [pw_16069x2] ; stp1_7

  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m0, [rsp + transposed_in + 16 * 0]
  mova m10, [pw_11585x2]
  pmulhrsw m0, m10 ; stp1_1

  mova m14, m11 ; stp1_4
  mova m13, m12 ; stp1_7

  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  SUM_SUB 13, 14, 9
  pmulhrsw m13, m10 ; stp1_6
  pmulhrsw m14, m10 ; stp1_5
%else
  BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
  SWAP 13, 14
%endif
  mova m7, m0 ; stp1_0 = stp1_1
  mova m4, m0 ; stp1_1
  mova m2, m7 ; stp1_0

  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
  SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
  SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
  SUM_SUB 4, 11, 9 ; stp1_3, stp1_4

  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
  SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
  SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
  SUM_SUB 4, 6, 9 ; stp1_3, stp1_12

  ; 0-3, 28-31 final stage
  mova m15, [stp + %4 + idx30]
  mova m10, [stp + %4 + idx31]
  SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
  SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
  mova [stp + %1 + idx0], m0
  mova [stp + %1 + idx1], m7
  mova [stp + %4 + idx30], m15
  mova [stp + %4 + idx31], m10
  mova m7, [stp + %4 + idx28]
  mova m0, [stp + %4 + idx29]
  SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
  SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
  mova [stp + %1 + idx2], m2
  mova [stp + %1 + idx3], m4
  mova [stp + %4 + idx28], m7
  mova [stp + %4 + idx29], m0

  ; 12-15, 16-19 final stage
  mova m0, [stp + %3 + idx16]
  mova m7, [stp + %3 + idx17]
  mova m2, [stp + %3 + idx18]
  mova m4, [stp + %3 + idx19]
  SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
  SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
  SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
  SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
  mova [stp + %2 + idx12], m6
  mova [stp + %2 + idx13], m5
  mova [stp + %2 + idx14], m3
  mova [stp + %2 + idx15], m1
  mova [stp + %3 + idx16], m0
  mova [stp + %3 + idx17], m7
  mova [stp + %3 + idx18], m2
  mova [stp + %3 + idx19], m4

  mova m4, [stp + %2 + idx8]
  mova m5, [stp + %2 + idx9]
  mova m6, [stp + %2 + idx10]
  mova m7, [stp + %2 + idx11]
  SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
  SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
  SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
  SUM_SUB 12, 4, 9 ; stp1_7, stp1_8

  ; 4-7, 24-27 final stage
  mova m0, [stp + %4 + idx27]
  mova m1, [stp + %4 + idx26]
  mova m2, [stp + %4 + idx25]
  mova m3, [stp + %4 + idx24]
  SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
  SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
  SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
  SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
  mova [stp + %4 + idx27], m0
  mova [stp + %4 + idx26], m1
  mova [stp + %4 + idx25], m2
  mova [stp + %4 + idx24], m3
  mova [stp + %1 + idx4], m11
  mova [stp + %1 + idx5], m14
  mova [stp + %1 + idx6], m13
  mova [stp + %1 + idx7], m12

  ; 8-11, 20-23 final stage
  mova m0, [stp + %3 + idx20]
  mova m1, [stp + %3 + idx21]
  mova m2, [stp + %3 + idx22]
  mova m3, [stp + %3 + idx23]
  SUM_SUB 7, 0, 9 ; stp1_11, stp_20
  SUM_SUB 6, 1, 9 ; stp1_10, stp_21
  SUM_SUB 5, 2, 9 ; stp1_9, stp_22
  SUM_SUB 4, 3, 9 ; stp1_8, stp_23
  mova [stp + %2 + idx8], m4
  mova [stp + %2 + idx9], m5
  mova [stp + %2 + idx10], m6
  mova [stp + %2 + idx11], m7
  mova [stp + %3 + idx20], m0
  mova [stp + %3 + idx21], m1
  mova [stp + %3 + idx22], m2
  mova [stp + %3 + idx23], m3
%endmacro

; RECON_AND_STORE src_offset
; Adds the 32x32 residual stored at [rsp + src_offset] to the destination.
; The residual is read as four planes 16 * 32 bytes apart, 16 bytes (eight
; words) per row and plane. For each of the 32 rows the values are rounded
; with (x + 32) >> 6, added to the 32 destination bytes at [outputq] and
; written back with unsigned saturation; outputq then advances by strideq.
;
; Clobbers m0-m8, m11, r6 and stp, and leaves outputq past the last row.
; m8 is zeroed here, so the pd_8192 rounding constant is gone afterwards.
; The two 16-byte stores use mova, i.e. they need [outputq] to be 16-byte
; aligned - NOTE(review): confirm every caller guarantees that.
%macro RECON_AND_STORE 1
  mova m11, [pw_32]
  lea stp, [rsp + %1]
  mov r6, 32
  pxor m8, m8
%%recon_and_store:
  mova m0, [stp + 16 * 32 * 0]
  mova m1, [stp + 16 * 32 * 1]
  mova m2, [stp + 16 * 32 * 2]
  mova m3, [stp + 16 * 32 * 3]
  add stp, 16

  paddw m0, m11
  paddw m1, m11
  paddw m2, m11
  paddw m3, m11
  psraw m0, 6
  psraw m1, 6
  psraw m2, 6
  psraw m3, 6
  movh m4, [outputq + 0]
  movh m5, [outputq + 8]
  movh m6, [outputq + 16]
  movh m7, [outputq + 24]
  punpcklbw m4, m8
  punpcklbw m5, m8
  punpcklbw m6, m8
  punpcklbw m7, m8
  paddw m0, m4
  paddw m1, m5
  paddw m2, m6
  paddw m3, m7
  packuswb m0, m1
  packuswb m2, m3
  mova [outputq + 0], m0
  mova [outputq + 16], m2
  lea outputq, [outputq + strideq]
  dec r6
  jnz %%recon_and_store
%endmacro
; Stack frame shared by the 32x32 functions (byte offsets from rsp):
;   [0, 16*32*4)          intermediate / result area, four planes of
;                         32 rows x 16 bytes (pass one and pass two both
;                         start at offset 0),
;   [16*32*4, 16*32*5)    transposed_in: 32 slots of 16 bytes holding the
;                         transposed input rows of the current pass.
; stp (r8) is the running base pointer into the intermediate area.
%define i32x32_size 16*32*5
%define pass_two_start 16*32*0
%define transposed_in 16*32*4
%define pass_one_start 16*32*0
%define stp r8

INIT_XMM ssse3
; 32x32 inverse transform + reconstruction using only the top-left 8x8
; input coefficients.
;
; Args (x86inc names): input  - coefficients, read with aligned loads
;                               (16-bit, or 32-bit packed down with
;                               packssdw when CONFIG_VP9_HIGHBITDEPTH),
;                      output - 32x32 block of 8-bit pixels, updated in place,
;                      stride - byte distance between output rows.
; Registers: r3 = source of the current 8x8 tile, r4 = transposed_in base,
;            r6 = pass-two loop counter, r9 = pass-two read pointer,
;            stp = write pointer, m8 = pd_8192 throughout the transforms.
; Pass one transforms the single 8x8 tile; pass two runs four times, each
; time on eight 16-byte rows of one plane of the intermediate area, and
; RECON_AND_STORE finally adds the result to the destination.
cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
  mova m8, [pd_8192]
  lea stp, [rsp + pass_one_start]

idct32x32_34:
  mov r3, inputq
  lea r4, [rsp + transposed_in]

idct32x32_34_transpose:
  ; load eight rows of eight coefficients (row pitch = 32 coefficients)
%if CONFIG_VP9_HIGHBITDEPTH
  mova m0, [r3 + 0]
  packssdw m0, [r3 + 16]
  mova m1, [r3 + 32 * 4]
  packssdw m1, [r3 + 32 * 4 + 16]
  mova m2, [r3 + 32 * 8]
  packssdw m2, [r3 + 32 * 8 + 16]
  mova m3, [r3 + 32 * 12]
  packssdw m3, [r3 + 32 * 12 + 16]
  mova m4, [r3 + 32 * 16]
  packssdw m4, [r3 + 32 * 16 + 16]
  mova m5, [r3 + 32 * 20]
  packssdw m5, [r3 + 32 * 20 + 16]
  mova m6, [r3 + 32 * 24]
  packssdw m6, [r3 + 32 * 24 + 16]
  mova m7, [r3 + 32 * 28]
  packssdw m7, [r3 + 32 * 28 + 16]
%else
  mova m0, [r3 + 0]
  mova m1, [r3 + 16 * 4]
  mova m2, [r3 + 16 * 8]
  mova m3, [r3 + 16 * 12]
  mova m4, [r3 + 16 * 16]
  mova m5, [r3 + 16 * 20]
  mova m6, [r3 + 16 * 24]
  mova m7, [r3 + 16 * 28]
%endif

  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  IDCT32X32_34 16*0, 16*32, 16*64, 16*96

  ; Pass two. stp is reloaded from rsp here, so no pointer update is needed
  ; after the single pass-one iteration.
  mov r6, 4
  lea stp, [rsp + pass_one_start]
  lea r9, [rsp + pass_one_start]

idct32x32_34_2:
  lea r4, [rsp + transposed_in]
  mov r3, r9

idct32x32_34_transpose_2:
  mova m0, [r3 + 0]
  mova m1, [r3 + 16 * 1]
  mova m2, [r3 + 16 * 2]
  mova m3, [r3 + 16 * 3]
  mova m4, [r3 + 16 * 4]
  mova m5, [r3 + 16 * 5]
  mova m6, [r3 + 16 * 6]
  mova m7, [r3 + 16 * 7]

  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  IDCT32X32_34 16*0, 16*8, 16*16, 16*24

  lea stp, [stp + 16 * 32]
  add r9, 16 * 32
  dec r6
  jnz idct32x32_34_2

  RECON_AND_STORE pass_two_start

  RET

; IDCT32X32_135 off0, off8, off16, off24
; One 1-D pass of the 32-point inverse transform for eight lanes, using
; input rows 0-15 (rows 16-31 are treated as zero).
;
; On entry: [rsp + transposed_in + 16 * k], k = 0..15, hold the transposed
;           input rows, m8 = pd_8192, stp = base of the output area.
; The 32 results are written to [stp + offG + idxN], where off0 / off8 /
; off16 / off24 (%1-%4) are the byte offsets of the 0-7 / 8-15 / 16-23 /
; 24-31 groups; the same slots are used as spill space while the macro
; runs. Clobbers m0-m7 and m9-m15.
%macro IDCT32X32_135 4
  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m1, [rsp + transposed_in + 16 * 1]
  mova m11, m1
  pmulhrsw m1, [pw___804x2] ; stp1_16
  pmulhrsw m11, [pw_16364x2] ; stp2_31

  mova m7, [rsp + transposed_in + 16 * 7]
  mova m12, m7
  pmulhrsw m7, [pw_15426x2] ; stp1_28
  pmulhrsw m12, [pw_m5520x2] ; stp2_19

  mova m3, [rsp + transposed_in + 16 * 9]
  mova m4, m3
  pmulhrsw m3, [pw__7005x2] ; stp1_18
  pmulhrsw m4, [pw_14811x2] ; stp2_29

  mova m0, [rsp + transposed_in + 16 * 15]
  mova m2, m0
  pmulhrsw m0, [pw_12140x2] ; stp1_30
  pmulhrsw m2, [pw_m11003x2] ; stp2_17

  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
  SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
  SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
  SUM_SUB 11, 0, 9 ; stp2_31, stp2_30

  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18

  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
  SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
  SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
  SUM_SUB 2, 4, 9 ; stp2_30, stp2_29

  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
  BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28

  mova [stp + %3 + idx16], m1
  mova [stp + %3 + idx17], m0
  mova [stp + %3 + idx18], m4
  mova [stp + %3 + idx19], m7
  mova [stp + %4 + idx28], m12
  mova [stp + %4 + idx29], m3
  mova [stp + %4 + idx30], m2
  mova [stp + %4 + idx31], m11

  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m2, [rsp + transposed_in + 16 * 3]
  mova m3, m2
  pmulhrsw m3, [pw_m2404x2] ; stp1_23
  pmulhrsw m2, [pw_16207x2] ; stp2_24

  mova m5, [rsp + transposed_in + 16 * 5]
  mova m6, m5
  pmulhrsw m5, [pw__3981x2] ; stp1_20
  pmulhrsw m6, [pw_15893x2] ; stp2_27

  mova m14, [rsp + transposed_in + 16 * 11]
  mova m13, m14
  pmulhrsw m13, [pw_m8423x2] ; stp1_21
  pmulhrsw m14, [pw_14053x2] ; stp2_26

  mova m0, [rsp + transposed_in + 16 * 13]
  mova m1, m0
  pmulhrsw m0, [pw__9760x2] ; stp1_22
  pmulhrsw m1, [pw_13160x2] ; stp2_25

  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
  SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
  SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
  SUM_SUB 6, 14, 9 ; stp2_27, stp2_26

  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
  BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22

  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
  SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
  SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
  SUM_SUB 1, 13, 9 ; stp2_25, stp2_26

  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
  BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21

  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m4, [stp + %3 + idx16]
  mova m7, [stp + %3 + idx17]
  mova m11, [stp + %3 + idx18]
  mova m12, [stp + %3 + idx19]
  SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
  SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
  SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
  SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
  mova [stp + %3 + idx16], m4
  mova [stp + %3 + idx17], m7
  mova [stp + %3 + idx18], m11
  mova [stp + %3 + idx19], m12

  mova m4, [stp + %4 + idx28]
  mova m7, [stp + %4 + idx29]
  mova m11, [stp + %4 + idx30]
  mova m12, [stp + %4 + idx31]
  SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
  SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
  SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
  SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
  mova [stp + %4 + idx28], m4
  mova [stp + %4 + idx29], m7
  mova [stp + %4 + idx30], m11
  mova [stp + %4 + idx31], m12

  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; the 16-bit SUM_SUB + pmulhrsw form is disabled; the 32-bit butterfly
  ; below is used instead
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova m10, [pw_11585x2]
  SUM_SUB 6, 5, 9
  pmulhrsw m6, m10 ; stp1_27
  pmulhrsw m5, m10 ; stp1_20
  SUM_SUB 13, 14, 9
  pmulhrsw m13, m10 ; stp1_26
  pmulhrsw m14, m10 ; stp1_21
  SUM_SUB 1, 0, 9
  pmulhrsw m1, m10 ; stp1_25
  pmulhrsw m0, m10 ; stp1_22
  SUM_SUB 2, 3, 9
  pmulhrsw m2, m10 ; stp1_24
  pmulhrsw m3, m10 ; stp1_23
%else
  BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
  SWAP 6, 5
  BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
  SWAP 13, 14
  BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
  SWAP 1, 0
  BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
  SWAP 2, 3
%endif
  mova [stp + %3 + idx20], m5
  mova [stp + %3 + idx21], m14
  mova [stp + %3 + idx22], m0
  mova [stp + %3 + idx23], m3
  mova [stp + %4 + idx24], m2
  mova [stp + %4 + idx25], m1
  mova [stp + %4 + idx26], m13
  mova [stp + %4 + idx27], m6

  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m0, [rsp + transposed_in + 16 * 2]
  mova m1, m0
  pmulhrsw m0, [pw__1606x2] ; stp1_8
  pmulhrsw m1, [pw_16305x2] ; stp2_15

  mova m6, [rsp + transposed_in + 16 * 6]
  mova m7, m6
  pmulhrsw m7, [pw_m4756x2] ; stp2_11
  pmulhrsw m6, [pw_15679x2] ; stp1_12

  mova m4, [rsp + transposed_in + 16 * 10]
  mova m5, m4
  pmulhrsw m4, [pw__7723x2] ; stp1_10
  pmulhrsw m5, [pw_14449x2] ; stp2_13

  mova m2, [rsp + transposed_in + 16 * 14]
  mova m3, m2
  pmulhrsw m3, [pw_m10394x2] ; stp1_9
  pmulhrsw m2, [pw_12665x2] ; stp2_14

  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
  SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
  SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
  SUM_SUB 1, 2, 9 ; stp1_15, stp1_14

  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
  BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10

  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
  SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
  SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
  SUM_SUB 3, 5, 9 ; stp1_14, stp1_13

  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova m10, [pw_11585x2]
  SUM_SUB 5, 4, 9
  pmulhrsw m5, m10 ; stp1_13
  pmulhrsw m4, m10 ; stp1_10
  SUM_SUB 6, 7, 9
  pmulhrsw m6, m10 ; stp1_12
  pmulhrsw m7, m10 ; stp1_11
%else
  BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
  SWAP 5, 4
  BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
  SWAP 6, 7
%endif

  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova [stp + %2 + idx8], m0
  mova [stp + %2 + idx9], m2
  mova [stp + %2 + idx10], m4
  mova [stp + %2 + idx11], m7
  mova [stp + %2 + idx12], m6
  mova [stp + %2 + idx13], m5
  mova [stp + %2 + idx14], m3
  mova [stp + %2 + idx15], m1

  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m11, [rsp + transposed_in + 16 * 4]
  mova m12, m11
  pmulhrsw m11, [pw__3196x2] ; stp1_4
  pmulhrsw m12, [pw_16069x2] ; stp1_7

  mova m13, [rsp + transposed_in + 16 * 12]
  mova m14, m13
  pmulhrsw m13, [pw_13623x2] ; stp1_6
  pmulhrsw m14, [pw_m9102x2] ; stp1_5

  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m0, [rsp + transposed_in + 16 * 0]
  mova m2, [rsp + transposed_in + 16 * 8]
  pmulhrsw m0, [pw_11585x2] ; stp1_1
  mova m3, m2
  pmulhrsw m2, [pw__6270x2] ; stp1_2
  pmulhrsw m3, [pw_15137x2] ; stp1_3

  SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
  SUM_SUB 12, 13, 9 ; stp1_7, stp1_6

  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova m10, [pw_11585x2]
  SUM_SUB 13, 14, 9
  pmulhrsw m13, m10 ; stp1_6
  pmulhrsw m14, m10 ; stp1_5
%else
  BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
  SWAP 13, 14
%endif
  mova m1, m0 ; stp1_0 = stp1_1
  SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
  SUM_SUB 1, 2, 9 ; stp1_1, stp1_2

  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
  SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
  SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
  SUM_SUB 3, 11, 9 ; stp1_3, stp1_4

  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m4, [stp + %2 + idx12]
  mova m5, [stp + %2 + idx13]
  mova m6, [stp + %2 + idx14]
  mova m7, [stp + %2 + idx15]
  SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
  SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
  SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
  SUM_SUB 3, 4, 9 ; stp1_3, stp1_12

  ; 0-3, 28-31 final stage
  mova m10, [stp + %4 + idx31]
  mova m15, [stp + %4 + idx30]
  SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
  SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
  mova [stp + %1 + idx0], m0
  mova [stp + %1 + idx1], m1
  mova [stp + %4 + idx31], m10
  mova [stp + %4 + idx30], m15
  mova m0, [stp + %4 + idx29]
  mova m1, [stp + %4 + idx28]
  SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
  SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
  mova [stp + %1 + idx2], m2
  mova [stp + %1 + idx3], m3
  mova [stp + %4 + idx29], m0
  mova [stp + %4 + idx28], m1

  ; 12-15, 16-19 final stage
  mova m0, [stp + %3 + idx16]
  mova m1, [stp + %3 + idx17]
  mova m2, [stp + %3 + idx18]
  mova m3, [stp + %3 + idx19]
  SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
  SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
  SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
  SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
  mova [stp + %2 + idx12], m4
  mova [stp + %2 + idx13], m5
  mova [stp + %2 + idx14], m6
  mova [stp + %2 + idx15], m7
  mova [stp + %3 + idx16], m0
  mova [stp + %3 + idx17], m1
  mova [stp + %3 + idx18], m2
  mova [stp + %3 + idx19], m3

  mova m4, [stp + %2 + idx8]
  mova m5, [stp + %2 + idx9]
  mova m6, [stp + %2 + idx10]
  mova m7, [stp + %2 + idx11]
  SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
  SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
  SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
  SUM_SUB 12, 4, 9 ; stp1_7, stp1_8

  ; 4-7, 24-27 final stage
  mova m3, [stp + %4 + idx24]
  mova m2, [stp + %4 + idx25]
  mova m1, [stp + %4 + idx26]
  mova m0, [stp + %4 + idx27]
  SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
  SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
  SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
  SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
  mova [stp + %4 + idx24], m3
  mova [stp + %4 + idx25], m2
  mova [stp + %4 + idx26], m1
  mova [stp + %4 + idx27], m0
  mova [stp + %1 + idx4], m11
  mova [stp + %1 + idx5], m14
  mova [stp + %1 + idx6], m13
  mova [stp + %1 + idx7], m12

  ; 8-11, 20-23 final stage
  mova m0, [stp + %3 + idx20]
  mova m1, [stp + %3 + idx21]
  mova m2, [stp + %3 + idx22]
  mova m3, [stp + %3 + idx23]
  SUM_SUB 7, 0, 9 ; stp1_11, stp_20
  SUM_SUB 6, 1, 9 ; stp1_10, stp_21
  SUM_SUB 5, 2, 9 ; stp1_9, stp_22
  SUM_SUB 4, 3, 9 ; stp1_8, stp_23
  mova [stp + %2 + idx8], m4
  mova [stp + %2 + idx9], m5
  mova [stp + %2 + idx10], m6
  mova [stp + %2 + idx11], m7
  mova [stp + %3 + idx20], m0
  mova [stp + %3 + idx21], m1
  mova [stp + %3 + idx22], m2
  mova [stp + %3 + idx23], m3
%endmacro

INIT_XMM ssse3
; 32x32 inverse transform + reconstruction using only the top-left 16x16
; input coefficients. Same arguments as idct32x32_34_add.
;
; Registers: r3 = source of the current 8x8 tile, r4 = write pointer into
;            transposed_in, r6 = outer loop counter, r7 = tile counter,
;            r9 = pass-two read pointer, stp = write pointer, m8 = pd_8192.
; Pass one runs twice (eight input rows each): two 8x8 tiles are
; transposed into transposed_in, then IDCT32X32_135 writes eight rows to
; each of the four planes. Pass two runs four times, each time on sixteen
; 16-byte rows of one plane, rewriting that plane; RECON_AND_STORE finally
; adds the result to the destination.
cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
  mova m8, [pd_8192]
  mov r6, 2
  lea stp, [rsp + pass_one_start]

idct32x32_135:
  mov r3, inputq
  lea r4, [rsp + transposed_in]
  mov r7, 2

idct32x32_135_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
  mova m0, [r3 + 0]
  packssdw m0, [r3 + 16]
  mova m1, [r3 + 32 * 4]
  packssdw m1, [r3 + 32 * 4 + 16]
  mova m2, [r3 + 32 * 8]
  packssdw m2, [r3 + 32 * 8 + 16]
  mova m3, [r3 + 32 * 12]
  packssdw m3, [r3 + 32 * 12 + 16]
  mova m4, [r3 + 32 * 16]
  packssdw m4, [r3 + 32 * 16 + 16]
  mova m5, [r3 + 32 * 20]
  packssdw m5, [r3 + 32 * 20 + 16]
  mova m6, [r3 + 32 * 24]
  packssdw m6, [r3 + 32 * 24 + 16]
  mova m7, [r3 + 32 * 28]
  packssdw m7, [r3 + 32 * 28 + 16]
%else
  mova m0, [r3 + 0]
  mova m1, [r3 + 16 * 4]
  mova m2, [r3 + 16 * 8]
  mova m3, [r3 + 16 * 12]
  mova m4, [r3 + 16 * 16]
  mova m5, [r3 + 16 * 20]
  mova m6, [r3 + 16 * 24]
  mova m7, [r3 + 16 * 28]
%endif

  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  mova [r4 + 0], m0
  mova [r4 + 16 * 1], m1
  mova [r4 + 16 * 2], m2
  mova [r4 + 16 * 3], m3
  mova [r4 + 16 * 4], m4
  mova [r4 + 16 * 5], m5
  mova [r4 + 16 * 6], m6
  mova [r4 + 16 * 7], m7

  ; step eight coefficients to the right for the next tile
%if CONFIG_VP9_HIGHBITDEPTH
  add r3, 32
%else
  add r3, 16
%endif
  add r4, 16 * 8
  dec r7
  jne idct32x32_135_transpose

  IDCT32X32_135 16*0, 16*32, 16*64, 16*96
  lea stp, [stp + 16 * 8]
  ; step eight input rows down for the next iteration
%if CONFIG_VP9_HIGHBITDEPTH
  lea inputq, [inputq + 32 * 32]
%else
  lea inputq, [inputq + 16 * 32]
%endif
  dec r6
  jnz idct32x32_135

  mov r6, 4
  lea stp, [rsp + pass_one_start]
  lea r9, [rsp + pass_one_start]

idct32x32_135_2:
  lea r4, [rsp + transposed_in]
  mov r3, r9
  mov r7, 2

idct32x32_135_transpose_2:
  mova m0, [r3 + 0]
  mova m1, [r3 + 16 * 1]
  mova m2, [r3 + 16 * 2]
  mova m3, [r3 + 16 * 3]
  mova m4, [r3 + 16 * 4]
  mova m5, [r3 + 16 * 5]
  mova m6, [r3 + 16 * 6]
  mova m7, [r3 + 16 * 7]

  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

  mova [r4 + 0], m0
  mova [r4 + 16 * 1], m1
  mova [r4 + 16 * 2], m2
  mova [r4 + 16 * 3], m3
  mova [r4 + 16 * 4], m4
  mova [r4 + 16 * 5], m5
  mova [r4 + 16 * 6], m6
  mova [r4 + 16 * 7], m7

  add r3, 16 * 8
  add r4, 16 * 8
  dec r7
  jne idct32x32_135_transpose_2

  IDCT32X32_135 16*0, 16*8, 16*16, 16*24

  lea stp, [stp + 16 * 32]
  add r9, 16 * 32
  dec r6
  jnz idct32x32_135_2

  RECON_AND_STORE pass_two_start

  RET

%macro IDCT32X32_1024 4
  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova m1, [rsp + transposed_in + 16 * 1]
  mova m11, [rsp + transposed_in + 16 * 31]
  BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31

  mova m0, [rsp + transposed_in + 16 * 15]
  mova m2, [rsp + transposed_in + 16 * 17]
  BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30

  mova m7, [rsp + transposed_in + 16 * 7]
  mova m12, [rsp + transposed_in + 16 * 25]
  BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28

  mova m3, [rsp + transposed_in + 16 * 9]
  mova m4, [rsp + transposed_in + 16 * 23]
  BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29

  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
  SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
  SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
  SUM_SUB 11, 0, 9 ; stp2_31, stp2_30

  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18

  ; BLOCK A STAGE 4 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 mova [stp + %3 + idx16], m1 mova [stp + %3 + idx17], m0 mova [stp + %3 + idx18], m4 mova [stp + %3 + idx19], m7 mova [stp + %4 + idx28], m12 mova [stp + %4 + idx29], m3 mova [stp + %4 + idx30], m2 mova [stp + %4 + idx31], m11 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m5, [rsp + transposed_in + 16 * 5] mova m6, [rsp + transposed_in + 16 * 27] BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 mova m13, [rsp + transposed_in + 16 * 21] mova m14, [rsp + transposed_in + 16 * 11] BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 mova m0, [rsp + transposed_in + 16 * 13] mova m1, [rsp + transposed_in + 16 * 19] BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 mova m2, [rsp + transposed_in + 16 * 3] mova m3, [rsp + transposed_in + 16 * 29] BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 ; BLOCK B STAGE 5 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m4, [stp + %3 + idx16] mova m7, [stp + %3 + idx17] mova m11, [stp + %3 + idx18] mova m12, [stp + %3 + idx19] SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 mova [stp + %3 + idx16], m4 mova [stp + %3 + idx17], m7 mova [stp + %3 + idx18], m11 mova [stp + %3 + idx19], m12 mova m4, [stp + %4 + idx28] mova m7, [stp + %4 + idx29] mova m11, [stp + %4 + idx30] mova m12, [stp + %4 + idx31] SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 mova [stp + %4 + idx28], m4 mova [stp + %4 + idx29], m7 mova [stp + %4 + idx30], m11 mova [stp + %4 + idx31], m12 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ %if 0 ; overflow occurs in SUM_SUB when using test streams mova m10, [pw_11585x2] SUM_SUB 6, 5, 9 pmulhrsw m6, m10 ; stp1_27 pmulhrsw m5, m10 ; stp1_20 SUM_SUB 13, 14, 9 pmulhrsw m13, m10 ; stp1_26 pmulhrsw m14, m10 ; stp1_21 SUM_SUB 1, 0, 9 pmulhrsw m1, m10 ; stp1_25 pmulhrsw m0, m10 ; stp1_22 SUM_SUB 2, 3, 9 pmulhrsw m2, m10 ; stp1_25 pmulhrsw m3, m10 ; stp1_22 %else BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 SWAP 6, 5 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 SWAP 13, 14 BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 SWAP 1, 0 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 SWAP 2, 3 %endif mova [stp + %3 + idx20], m5 mova [stp + %3 + idx21], m14 mova [stp + %3 + idx22], m0 mova [stp + %3 + idx23], m3 mova [stp + %4 + idx24], m2 mova [stp + %4 + idx25], m1 mova [stp + %4 + idx26], m13 mova [stp + %4 + idx27], m6 ; BLOCK C STAGE 
1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m0, [rsp + transposed_in + 16 * 2] mova m1, [rsp + transposed_in + 16 * 30] BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 mova m2, [rsp + transposed_in + 16 * 14] mova m3, [rsp + transposed_in + 16 * 18] BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 mova m4, [rsp + transposed_in + 16 * 10] mova m5, [rsp + transposed_in + 16 * 22] BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 mova m6, [rsp + transposed_in + 16 * 6] mova m7, [rsp + transposed_in + 16 * 26] BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ %if 0 ; overflow occurs in SUM_SUB when using test streams mova m10, [pw_11585x2] SUM_SUB 5, 4, 9 pmulhrsw m5, m10 ; stp1_13 pmulhrsw m4, m10 ; stp1_10 SUM_SUB 6, 7, 9 pmulhrsw m6, m10 ; stp1_12 pmulhrsw m7, m10 ; stp1_11 %else BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 SWAP 5, 4 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 SWAP 6, 7 %endif ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova [stp + %2 + idx8], m0 mova [stp + %2 + idx9], m2 mova [stp + %2 + idx10], m4 mova [stp + %2 + idx11], m7 mova [stp 
+ %2 + idx12], m6 mova [stp + %2 + idx13], m5 mova [stp + %2 + idx14], m3 mova [stp + %2 + idx15], m1 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m11, [rsp + transposed_in + 16 * 4] mova m12, [rsp + transposed_in + 16 * 28] BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 mova m13, [rsp + transposed_in + 16 * 12] mova m14, [rsp + transposed_in + 16 * 20] BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m0, [rsp + transposed_in + 16 * 0] mova m1, [rsp + transposed_in + 16 * 16] %if 0 ; overflow occurs in SUM_SUB when using test streams mova m10, [pw_11585x2] SUM_SUB 0, 1, 9 pmulhrsw m0, m10 ; stp1_1 pmulhrsw m1, m10 ; stp1_0 %else BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 SWAP 0, 1 %endif mova m2, [rsp + transposed_in + 16 * 8] mova m3, [rsp + transposed_in + 16 * 24] BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 mova m10, [pw_11585x2] SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ %if 0 ; overflow occurs in SUM_SUB when using test streams SUM_SUB 13, 14, 9 pmulhrsw m13, m10 ; stp1_6 pmulhrsw m14, m10 ; stp1_5 %else BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 SWAP 13, 14 %endif SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m4, [stp + %2 + idx12] mova m5, [stp + %2 + idx13] mova m6, [stp + %2 + idx14] mova 
m7, [stp + %2 + idx15] SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 ; 0-3, 28-31 final stage mova m10, [stp + %4 + idx31] mova m15, [stp + %4 + idx30] SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 mova [stp + %1 + idx0], m0 mova [stp + %1 + idx1], m1 mova [stp + %4 + idx31], m10 mova [stp + %4 + idx30], m15 mova m0, [stp + %4 + idx29] mova m1, [stp + %4 + idx28] SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 mova [stp + %1 + idx2], m2 mova [stp + %1 + idx3], m3 mova [stp + %4 + idx29], m0 mova [stp + %4 + idx28], m1 ; 12-15, 16-19 final stage mova m0, [stp + %3 + idx16] mova m1, [stp + %3 + idx17] mova m2, [stp + %3 + idx18] mova m3, [stp + %3 + idx19] SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 mova [stp + %2 + idx12], m4 mova [stp + %2 + idx13], m5 mova [stp + %2 + idx14], m6 mova [stp + %2 + idx15], m7 mova [stp + %3 + idx16], m0 mova [stp + %3 + idx17], m1 mova [stp + %3 + idx18], m2 mova [stp + %3 + idx19], m3 mova m4, [stp + %2 + idx8] mova m5, [stp + %2 + idx9] mova m6, [stp + %2 + idx10] mova m7, [stp + %2 + idx11] SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 ; 4-7, 24-27 final stage mova m3, [stp + %4 + idx24] mova m2, [stp + %4 + idx25] mova m1, [stp + %4 + idx26] mova m0, [stp + %4 + idx27] SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 mova [stp + %4 + idx24], m3 mova [stp + %4 + idx25], m2 mova [stp + %4 + idx26], m1 mova [stp + %4 + idx27], m0 mova [stp + %1 + idx4], m11 mova [stp + %1 + idx5], m14 mova [stp + %1 + idx6], m13 mova [stp + %1 + idx7], m12 ; 8-11, 20-23 final stage mova m0, [stp + %3 + idx20] mova m1, [stp + %3 + idx21] mova 
m2, [stp + %3 + idx22] mova m3, [stp + %3 + idx23] SUM_SUB 7, 0, 9 ; stp1_11, stp_20 SUM_SUB 6, 1, 9 ; stp1_10, stp_21 SUM_SUB 5, 2, 9 ; stp1_9, stp_22 SUM_SUB 4, 3, 9 ; stp1_8, stp_23 mova [stp + %2 + idx8], m4 mova [stp + %2 + idx9], m5 mova [stp + %2 + idx10], m6 mova [stp + %2 + idx11], m7 mova [stp + %3 + idx20], m0 mova [stp + %3 + idx21], m1 mova [stp + %3 + idx22], m2 mova [stp + %3 + idx23], m3 %endmacro INIT_XMM ssse3 cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride mova m8, [pd_8192] mov r6, 4 lea stp, [rsp + pass_one_start] idct32x32_1024: mov r3, inputq lea r4, [rsp + transposed_in] mov r7, 4 idct32x32_1024_transpose: %if CONFIG_VP9_HIGHBITDEPTH mova m0, [r3 + 0] packssdw m0, [r3 + 16] mova m1, [r3 + 32 * 4] packssdw m1, [r3 + 32 * 4 + 16] mova m2, [r3 + 32 * 8] packssdw m2, [r3 + 32 * 8 + 16] mova m3, [r3 + 32 * 12] packssdw m3, [r3 + 32 * 12 + 16] mova m4, [r3 + 32 * 16] packssdw m4, [r3 + 32 * 16 + 16] mova m5, [r3 + 32 * 20] packssdw m5, [r3 + 32 * 20 + 16] mova m6, [r3 + 32 * 24] packssdw m6, [r3 + 32 * 24 + 16] mova m7, [r3 + 32 * 28] packssdw m7, [r3 + 32 * 28 + 16] %else mova m0, [r3 + 0] mova m1, [r3 + 16 * 4] mova m2, [r3 + 16 * 8] mova m3, [r3 + 16 * 12] mova m4, [r3 + 16 * 16] mova m5, [r3 + 16 * 20] mova m6, [r3 + 16 * 24] mova m7, [r3 + 16 * 28] %endif TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 mova [r4 + 0], m0 mova [r4 + 16 * 1], m1 mova [r4 + 16 * 2], m2 mova [r4 + 16 * 3], m3 mova [r4 + 16 * 4], m4 mova [r4 + 16 * 5], m5 mova [r4 + 16 * 6], m6 mova [r4 + 16 * 7], m7 %if CONFIG_VP9_HIGHBITDEPTH add r3, 32 %else add r3, 16 %endif add r4, 16 * 8 dec r7 jne idct32x32_1024_transpose IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 lea stp, [stp + 16 * 8] %if CONFIG_VP9_HIGHBITDEPTH lea inputq, [inputq + 32 * 32] %else lea inputq, [inputq + 16 * 32] %endif dec r6 jnz idct32x32_1024 mov r6, 4 lea stp, [rsp + pass_one_start] lea r9, [rsp + pass_one_start] idct32x32_1024_2: lea r4, [rsp + transposed_in] mov r3, r9 mov r7, 4 
idct32x32_1024_transpose_2: mova m0, [r3 + 0] mova m1, [r3 + 16 * 1] mova m2, [r3 + 16 * 2] mova m3, [r3 + 16 * 3] mova m4, [r3 + 16 * 4] mova m5, [r3 + 16 * 5] mova m6, [r3 + 16 * 6] mova m7, [r3 + 16 * 7] TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 mova [r4 + 0], m0 mova [r4 + 16 * 1], m1 mova [r4 + 16 * 2], m2 mova [r4 + 16 * 3], m3 mova [r4 + 16 * 4], m4 mova [r4 + 16 * 5], m5 mova [r4 + 16 * 6], m6 mova [r4 + 16 * 7], m7 add r3, 16 * 8 add r4, 16 * 8 dec r7 jne idct32x32_1024_transpose_2 IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 lea stp, [stp + 16 * 32] add r9, 16 * 32 dec r6 jnz idct32x32_1024_2 RECON_AND_STORE pass_two_start RET %endif