ref: c5a09faf24190d66fbc9d97f6374dbb4dc203b4c
dir: /codec/common/x86/deblock.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
; Syntax: NASM (Intel operand order).
; NOTE(review): the register aliases (r0..r7, argN), the prologue/epilogue
; helpers (LOAD_n_PARA, PUSH_XMM, POP_XMM, ...), MOVDQ and WELS_DB1 used in
; this file are not defined here; presumably they come from asm_inc.asm.
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

; Every constant below is used as a 128-bit memory operand of a non-VEX SSE
; instruction (movdqa/pxor/psubusb/pshufb), which requires 16-byte alignment.
; Each one therefore carries its own ALIGN so that inserting or reordering
; constants cannot silently misalign the ones that follow.

ALIGN   16
FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4

ALIGN   16
WELS_DB1_16:                    ; 16 x 0x01
    times 16 db 1

ALIGN   16
WELS_DB127_16:                  ; 16 x 0x7f, xor mask used by SSE2_CmpltUB
    times 16 db 127

ALIGN   16
WELS_DB96_16:                   ; 16 x 0x60, bias used by SSE2_DeblockP0Q0_Lt4
    times 16 db 96

ALIGN   16
WELS_SHUFB0000111122223333:     ; pshufb control: replicate bytes 0..3 four times each
    times 4 db 0
    times 4 db 1
    times 4 db 2
    times 4 db 3


SECTION .text

; Unsigned byte absolute difference.
; a=%1 b=%2 clobber=%3
; Subtract once in each direction with saturation and return the maximum.
; One of the two saturated differences is always zero, so OR-ing them yields
; |a - b|. The result is left in %1; %2 is not modified.
%macro SSE2_AbsDiffUB 3
    movdqa   %3, %2
    psubusb  %3, %1                     ; max(b - a, 0)
    psubusb  %1, %2                     ; max(a - b, 0)
    por      %1, %3                     ; |a - b|
%endmacro

; Unsigned byte compare less than.
; lhs=%1 rhs^0x7f=%2 0x7f=%3
; No unsigned byte lt/gt compare instruction available; xor by 0x7f and use a
; signed compare. Some other options do exist. This one allows modifying the lhs
; without mov and uses a bitwise op which can be executed on most ports on
; common architectures.
; x ^ 0x7f reverses the unsigned order when the result is read as signed, so
; (lhs ^ 0x7f) > (rhs ^ 0x7f) signed is the same as lhs < rhs unsigned.
; The caller supplies rhs already xor-ed. Result mask (FFh/00h per byte) in %1.
%macro SSE2_CmpltUB 3
    pxor     %1, %3                     ; lhs ^ 0x7f
    pcmpgtb  %1, %2                     ; signed >  <=>  unsigned lhs < rhs
%endmacro

; Unsigned byte compare greater than or equal.
; lhs=%1 rhs=%2
; min(lhs, rhs) == rhs exactly when lhs >= rhs. Result mask in %1.
%macro SSE2_CmpgeUB 2
    pminub   %1, %2
    pcmpeqb  %1, %2
%endmacro

; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
; Result in %1. %3 is overwritten with the upper bound (ref + diff, saturated)
; and %4 with the lower bound (ref - diff, saturated). %2 is not modified.
%macro SSE2_ClipUB 4
    movdqa   %4, %2
    psubusb  %4, %3                     ; lower bound
    paddusb  %3, %2                     ; upper bound
    pmaxub   %1, %4
    pminub   %1, %3
%endmacro

; (a + b + 1 - c) >> 1
; a=%1 b=%2 c=%3 [out:a^b&c]=%4
; pavgb rounds up; subtracting (a ^ b) & c removes the rounding where c has
; bit 0 set. Result in %1, the correction term is left in %4.
%macro SSE2_AvgbFloor1 4
    movdqa   %4, %1
    pxor     %4, %2
    pavgb    %1, %2
    pand     %4, %3
    psubb    %1, %4
%endmacro

; (a + b + carry) >> 1
; a=%1 b=%2 carry-1=%3
; Result in %1. Note that %2 is modified as well (left xor-ed with %3).
%macro SSE2_AvgbFloor2 3
    pxor     %1, %3
    pxor     %2, %3
    pavgb    %1, %2
    pxor     %1, %3
%endmacro

; a = (a & m) | (b & ~m)
; a=%1 b=%2 m=%3
; Result in %1. The mask register %3 is destroyed (it ends up as b & ~m).
%macro SSE2_Blend 3
    pand     %1, %3
    pandn    %3, %2
    por      %1, %3
%endmacro

; Compute
; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
; 16-wide parallel in packed byte representation in xmm registers.
;
; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
;
; Results are written back to %2 (p0) and %3 (q0). %1, %4 and %6 are
; overwritten in addition to the declared clobbers %7 and %8.
%macro SSE2_DeblockP0Q0_Lt4 8
    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
    ; Bias so that unsigned saturation can be used.
    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
    ; subtracted from the biased value.
    movdqa     %7, %2
    psubusb    %7, %3                   ; clip(p0 - q0, 0, 255)
    ; ((p1 - q1) >> 2) + 0xc0
    pxor       %4, %6                   ; q1 ^ 0xff aka -q1 - 1 & 0xff
    pavgb      %1, %4                   ; (((p1 - q1 + 0x100) >> 1)
    pavgb      %1, %6                   ;  + 0x100) >> 1
    psubusb    %1, %7                   ; -= clip(p0 - q0, 0, 255) saturate.
    movdqa     %8, %3
    psubusb    %8, %2                   ; (clip(q0 - p0, 0, 255)
    pavgb      %8, %1                   ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1

    ; Unbias and split into a non-negative and a non-positive part.
    ; Clip each part to iTc via minub.
    ; Add/subtract each part to/from p0/q0 and clip.
%ifdef X86_32_PICASM
    ; Build the 0x60 constant in a 16-byte aligned stack slot instead of
    ; reading WELS_DB96_16 (presumably because absolute data addresses are
    ; not usable in 32-bit PIC builds). r0 keeps the original esp meanwhile.
    push       r0
    mov        r0, esp
    sub        esp, 16
    and        esp, -16
    push       0x60606060 ;WELS_DB96_16
    push       0x60606060
    push       0x60606060
    push       0x60606060
    movdqa     %6, [esp]
    psubusb    %6, %8
    psubusb    %8, [esp]
    mov        esp, r0
    pop        r0
%else
    movdqa     %6, [WELS_DB96_16]
    psubusb    %6, %8                   ; non-positive part, as a magnitude
    psubusb    %8, [WELS_DB96_16]       ; non-negative part
%endif
    pminub     %6, %5
    pminub     %8, %5
    psubusb    %2, %6
    paddusb    %2, %8                   ; p0
    paddusb    %3, %6
    psubusb    %3, %8                   ; q0
%endmacro


;*******************************************************************************
;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta, int8_t * pTC)
;
; Filters one 16-byte wide edge: reads rows p2..q2 and stores new p1, p0, q0, q1.
; Register roles after the prologue:
;   r0 = pPix (q0 row)        r1 = -iStride       r2 = iStride
;   r3 = pPix - iStride (p0)  r4 = pTC (presumably, as the 5th argument loaded
;                                  by LOAD_5_PARA)
;*******************************************************************************

WELS_EXTERN DeblockLumaLt4V_ssse3
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    pxor     xmm3, xmm3
%ifdef X86_32_PICASM
    ; 0x7f constant on an aligned stack slot; it stays live (esp unchanged)
    ; until the last SSE2_CmpltUB below. r4 keeps the original esp.
    push     r4
    mov      r4, esp
    sub      esp, 16
    and      esp, -16
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    pxor     xmm1, [esp]
    pxor     xmm2, [esp]
%else
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
%endif
    pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable deblocking.
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm6, xmm1, [esp]            ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%else
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%endif
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]            ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%endif
    pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    movdqa   xmm0, xmm7
    SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]            ; bDeltaP2P0 = |p2 - p0| < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%endif
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
    movdqa   xmm1, xmm5
    SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm5, xmm2, [esp]            ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    mov      esp, r4                          ; release the 0x7f stack constant
    pop      r4
%else
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
%endif

    pavgb    xmm3, [r3 + 0 * r1]              ; (p0 + q0 + 1) >> 1
    pcmpeqw  xmm2, xmm2                       ; FFh
    pxor     xmm3, xmm2
    ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm0, xmm2
    pavgb    xmm0, xmm3
    pxor     xmm0, xmm2
    ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1
    pxor     xmm1, xmm2
    pavgb    xmm1, xmm3
    pxor     xmm1, xmm2

    movd     xmm3, [r4]                       ; 4 tc bytes
%ifdef X86_32_PICASM
    push     r0
    mov      r0, esp
    sub      esp, 16
    and      esp, -16
    push     0x03030303 ;WELS_SHUFB0000111122223333
    push     0x02020202
    push     0x01010101
    push     0x00000000
    pshufb   xmm3, [esp]                      ; iTc
    mov      esp, r0
    pop      r0
%else
    pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
%endif
    movdqa   xmm4, xmm3                       ; iTc0 = iTc
    pcmpgtb  xmm3, xmm2                       ; iTc > -1 ? 0xff : 0x00
    pand     xmm6, xmm3                       ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
    movdqa   xmm3, xmm4
    psubb    xmm3, xmm7                       ; iTc -= bDeltaP2P0 ? -1 : 0
    psubb    xmm3, xmm5                       ; iTc -= bDeltaQ2Q0 ? -1 : 0
    pand     xmm3, xmm6                       ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0
    pand     xmm7, xmm6                       ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm5, xmm6                       ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0
    pand     xmm7, xmm4                       ; iTc0 & (bDeltaP2P0 ? 0xff : 0)
    pand     xmm5, xmm4                       ; iTc0 & (bDeltaQ2Q0 ? 0xff : 0)
    MOVDQ    xmm4, [r3 + 1 * r1]
    SSE2_ClipUB xmm0, xmm4, xmm7, xmm6        ; clip p1.
    MOVDQ    xmm6, [r0 + 1 * r2]
    MOVDQ    [r3 + 1 * r1], xmm0              ; store p1.
    SSE2_ClipUB xmm1, xmm6, xmm5, xmm7        ; clip q1.
    MOVDQ    [r0 + 1 * r2], xmm1              ; store q1.

    ; xmm4/xmm6 still hold the unfiltered p1/q1 loaded above.
    MOVDQ    xmm1, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
    MOVDQ    [r3 + 0 * r1], xmm1              ; store p0.
    MOVDQ    [r0 + 0 * r2], xmm0              ; store q0.

    POP_XMM
    LOAD_5_PARA_POP
    ret


; Deblock 3x16 luma pixels for the eq4 case.
;
; Compose 8-bit averages from pavgbs. Ie. (p1 + p0 + p2 + q0 + 2) >> 2 can be
; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
; which maps to 3 pavgbs.
;
; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
;
; The names p0..p3 are relative to the side being filtered: row n of that side
; is at [pPix + n * iStride]. The macro advances %1 by one stride (add %1, %2)
; and stores new p0, p1 and p2. %6 is and-ed with %5 in place. When %11 is 0,
; %5 is destroyed by the blend and %3/%4 do not return p0/p1.
%macro SSE2_DeblockLumaEq4_3x16P 12
    movdqa   %7, %3
    movdqa   %8, %6
    MOVDQ    %10, [%1 + 1 * %2]                      ; p1
    SSE2_Blend %7, %10, %8                           ; t0 = bDeltaP2P0 ? q0 : p1
    movdqa   %8, %6
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_Blend %9, %4, %8                            ; t1 = bDeltaP2P0 ? p2 : q1
    SSE2_AvgbFloor1 %4, %9, %12, %8                  ; t1 = (t1 + q1) >> 1
    SSE2_AvgbFloor1 %10, [%1], %12, %8               ; (p0 + p1) >> 1, p0 ^ p1
    pxor     %8, %12
    SSE2_AvgbFloor1 %7, %4, %8, %9                   ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
    MOVDQ    %9, [%1 + 2 * %2]                       ; p2
    SSE2_AvgbFloor1 %3, %9, %8, %4                   ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pavgb    %7, %10                                 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
    movdqa   %8, %10
    pxor     %8, %3                                  ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
    pand     %8, %12                                 ; & 1
    pavgb    %10, %3                                 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
    pand     %6, %5                                  ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
%if %11
    MOVDQ    %3, [%1 + 0 * %2]                       ; p0
    movdqa   %4, %5                                  ; copy the mask so %5 survives the blend
    SSE2_Blend %7, %3, %4                            ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%else
    SSE2_Blend %7, [%1 + 0 * %2], %5                 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
%endif
    MOVDQ    [%1 + 0 * %2], %7                       ; store p0
    add      %1, %2                                  ; pPix now addresses p1; offsets below are (n - 1)
    movdqa   %7, %10
    psubb    %10, %8                                 ; (p0 + p1 + p2 + q0) >> 2
    psubb    %8, %12
    MOVDQ    %4, [%1 + (3 - 1) * %2]                 ; p3
    SSE2_AvgbFloor2 %4, %9, %8                       ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
    pavgb    %10, %4                                 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
    movdqa   %8, %6
    SSE2_Blend %10, [%1 + (2 - 1) * %2], %8          ; p2out = bDeltaP2P0 ? p2' : p2
    MOVDQ    [%1 + (2 - 1) * %2], %10                ; store p2
%if %11
    MOVDQ    %4, [%1 + (1 - 1) * %2]                 ; p1
    SSE2_Blend %7, %4, %6                            ; p1out = bDeltaP2P0 ? p1' : p1
%else
    SSE2_Blend %7, [%1 + (1 - 1) * %2], %6           ; p1out = bDeltaP2P0 ? p1' : p1
%endif
    MOVDQ    [%1 + (1 - 1) * %2], %7                 ; store p1
%endmacro


;*******************************************************************************
;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta)
;
; Builds the enable masks, then runs SSE2_DeblockLumaEq4_3x16P once for the
; p side (base r3, stride -iStride) and once for the q side (base r0, stride
; +iStride). The first call is made with preserve_p0p1=1 so that xmm0/xmm1
; leave it holding the original p0/p1, which the second call takes as the
; "opposite side" inputs.
;*******************************************************************************

WELS_EXTERN DeblockLumaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 10
    SIGN_EXTENSION r1, r1d
    movd     xmm1, arg3d
    movd     xmm2, arg4d
    shr      r2, 2
    add      r2, 1
    movd     xmm3, r2d                        ; only byte 0 is used by the pshufb below
    pxor     xmm4, xmm4
%ifdef X86_32_PICASM
    ; 0x7f constant on an aligned stack slot; r4 keeps the original esp until
    ; the last SSE2_CmpltUB below.
    push     r4
    mov      r4, esp
    sub      esp, 16
    and      esp, -16
    push     0x7f7f7f7f ;WELS_DB127_16
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    push     0x7f7f7f7f
    pxor     xmm1, [esp]
    pxor     xmm2, [esp]
%else
    pxor     xmm1, [WELS_DB127_16]
    pxor     xmm2, [WELS_DB127_16]
%endif
    pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
    pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
    pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
    mov      r2, r1                           ; iStride
    neg      r1                               ; -iStride
    lea      r3, [r0 + r1]                    ; pPix - iStride

    ; Compute masks to enable/disable filtering.
    MOVDQ    xmm7, [r3 + 1 * r1]              ; p1
    MOVDQ    xmm6, [r3 + 0 * r1]              ; p0
    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    movdqa   xmm4, xmm6
    SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
    SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm6, xmm1, [esp]            ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%else
    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
%endif
    MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
    SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
    pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]            ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
%endif
    pand     xmm6, xmm7                       ; & bDeltaP0Q0

    MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
    SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm7, xmm2, [esp]            ; bDeltaP2P0 = |p2 - p0| < iBeta
%else
    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
%endif
    pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

    MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
    MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
    SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
%ifdef X86_32_PICASM
    SSE2_CmpltUB xmm5, xmm2, [esp]            ; bDeltaQ2Q0 = |q2 - q0| < iBeta
    mov      esp, r4                          ; release the 0x7f stack constant
    pop      r4
%else
    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
%endif
    pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2

%ifdef X86_32
    ; Push xmm5 to free up one register. Align stack so as to ensure that failed
    ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB).
    ; r2 (iStride) is reused to hold the original esp, hence the "neg r1" that
    ; turns -iStride back into +iStride before the q-side call.
    mov      r2, esp
    sub      esp,  16
    and      esp, -16
%ifdef X86_32_PICASM
    ; [esp+16] = 16 x 0x01 (stand-in for WELS_DB1_16), [esp] = saved xmm5.
    push     0x01010101
    push     0x01010101
    push     0x01010101
    push     0x01010101
    sub      esp,  16
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
    movdqa   xmm5, [esp]
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
    mov      esp, r2
%else
    movdqa   [esp], xmm5
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
    movdqa   xmm5, [esp]
    mov      esp, r2
    neg      r1
    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
%endif
%else
    movdqa   xmm9, [WELS_DB1_16]
    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
    SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
;
; Loads the 4 bytes [p1,p0,q0,q1] at offset -2 from each of 8 cb lines and
; 8 cr lines and transposes them into four 16-byte registers.
; Byte order inside each output register: cb lines 0,2,4,6, cr lines 0,2,4,6,
; cb lines 1,3,5,7, cr lines 1,3,5,7.
; Side effect: %5 and %6 are advanced by one stride and left that way.
; Line 6 (and 7 after the advance) is addressed as [base + 2 * %8], which is
; base + 6 * iStride - 2.
%macro SSE2_LoadCbCr_4x16H 11
    movd       %1,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 0
    movd       %2,  [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 2
    punpcklbw  %1,  %2                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
    movd       %2,  [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 4
    movd       %9,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 6
    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
    punpcklwd  %1,  %2                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
    movd       %2,  [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 0
    movd       %9,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 2
    punpcklbw  %2,  %9                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
    movd       %9,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 4
    movd       %10, [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 6
    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
    punpcklwd  %2,  %9                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
    add        %5,  %7                 ; pPixCb += iStride
    add        %6,  %7                 ; pPixCr += iStride
    movd       %9,  [%5 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cb line 1
    movd       %10, [%5 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cb line 3
    punpcklbw  %9,  %10                ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
    movd       %10, [%5 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cb line 5
    movd       %3,  [%5 + 2 * %8]      ; [p1,p0,q0,q1] cb line 7
    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
    punpcklwd  %9,  %10                ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
    movd       %10, [%6 + 0 * %7 - 2]  ; [p1,p0,q0,q1] cr line 1
    movd       %3,  [%6 + 2 * %7 - 2]  ; [p1,p0,q0,q1] cr line 3
    punpcklbw  %10, %3                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
    movd       %3,  [%6 + 4 * %7 - 2]  ; [p1,p0,q0,q1] cr line 5
    movd       %4,  [%6 + 2 * %8]      ; [p1,p0,q0,q1] cr line 7
    punpcklbw  %3,  %4                 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
    punpcklwd  %10, %3                 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
    movdqa     %3,  %1
    punpckldq  %1,  %2                 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
    punpckhdq  %3,  %2                 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
    movdqa     %11, %9
    punpckldq  %9,  %10                ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
    punpckhdq  %11, %10                ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
    movdqa     %2,  %1
    punpcklqdq %1,  %9                 ; [p1 x 16] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %2,  %9                 ; [p0 x 16] cb/cr line 0,2,4,6,1,3,5,7
    movdqa     %4,  %3
    punpcklqdq %3,  %11                ; [q0 x 16] cb/cr line 0,2,4,6,1,3,5,7
    punpckhqdq %4,  %11                ; [q1 x 16] cb/cr line 0,2,4,6,1,3,5,7
%endmacro

; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10
;
; Counterpart of SSE2_LoadCbCr_4x16H: writes the [p0,q0] byte pair back at
; offset -1 of each of the 8 cb and 8 cr lines. The interleaved pairs are
; spilled to a 32-byte aligned stack buffer and written out 16 bits at a time.
; %8 and %9 must name the 32-bit and 16-bit views of the same register
; (the callers pass r4d and r4w). Line 6/7 is addressed as
; [base + 2 * %4 + 1], which is base + 6 * iStride - 1.
; Side effects: %1 and %2 are moved back by one stride; %5 is overwritten.
%macro SSE2_StoreCbCr_4x16H 10
    movdqa     %10, %5
    punpcklbw  %10, %6                 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
    punpckhbw  %5, %6                  ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
    mov        %7, r7                  ; preserve stack pointer
    and        r7, -16                 ; align stack pointer
    sub        r7, 32                  ; allocate stack space
    movdqa     [r7     ], %10          ; store cb/cr line 0,2,4,6 pairs on the stack
    movdqa     [r7 + 16], %5           ; store cb/cr line 1,3,5,7 pairs on the stack
    mov        %8, [r7 + 16]           ; [p0,q0,p0,q0] cb line 1,3
    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 1
    shr        %8, 16                  ; [p0,q0] cb line 3
    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 3
    mov        %8, [r7 + 20]           ; [p0,q0,p0,q0] cb line 5,7
    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 5
    shr        %8, 16                  ; [p0,q0] cb line 7
    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 7
    mov        %8, [r7 + 24]           ; [p0,q0,p0,q0] cr line 1,3
    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 1
    shr        %8, 16                  ; [p0,q0] cr line 3
    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 3
    mov        %8, [r7 + 28]           ; [p0,q0,p0,q0] cr line 5,7
    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 5
    shr        %8, 16                  ; [p0,q0] cr line 7
    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 7
    sub        %1, %3                  ; pPixCb -= iStride
    sub        %2, %3                  ; pPixCr -= iStride
    mov        %8, [r7     ]           ; [p0,q0,p0,q0] cb line 0,2
    mov        [%1 + 0 * %3 - 1], %9   ; store [p0,q0] cb line 0
    shr        %8, 16                  ; [p0,q0] cb line 2
    mov        [%1 + 2 * %3 - 1], %9   ; store [p0,q0] cb line 2
    mov        %8, [r7 +  4]           ; [p0,q0,p0,q0] cb line 4,6
    mov        [%1 + 4 * %3 - 1], %9   ; store [p0,q0] cb line 4
    shr        %8, 16                  ; [p0,q0] cb line 6
    mov        [%1 + 2 * %4 + 1], %9   ; store [p0,q0] cb line 6
    mov        %8, [r7 +  8]           ; [p0,q0,p0,q0] cr line 0,2
    mov        [%2 + 0 * %3 - 1], %9   ; store [p0,q0] cr line 0
    shr        %8, 16                  ; [p0,q0] cr line 2
    mov        [%2 + 2 * %3 - 1], %9   ; store [p0,q0] cr line 2
    mov        %8, [r7 + 12]           ; [p0,q0,p0,q0] cr line 4,6
    mov        [%2 + 4 * %3 - 1], %9   ; store [p0,q0] cr line 4
    shr        %8, 16                  ; [p0,q0] cr line 6
    mov        [%2 + 2 * %4 + 1], %9   ; store [p0,q0] cr line 6
    mov        r7, %7                  ; restore stack pointer
%endmacro

; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11
;
; Chroma bS<4 filter on 16 bytes. Builds the "do not filter" mask with
; unsigned >= compares, folds it into iTc, then delegates to
; SSE2_DeblockP0Q0_Lt4. Results are left in %2 (p0) and %3 (q0).
; %5 arrives holding iAlpha in every byte and is reused as scratch afterwards;
; %6 is a 32-bit operand from which byte 0 is broadcast as iBeta.
; interleaveTC selects how the 4 tc bytes at [%7] are spread over 16 lanes:
;   1: t0,t0,t1,t1,t2,t2,t3,t3 repeated twice
;   0: t0,t1,t2,t3 repeated four times
%macro SSSE3_DeblockChromaLt4 11
    movdqa    %8, %3
    SSE2_AbsDiffUB %8, %2, %9           ; |p0 - q0|
    SSE2_CmpgeUB %8, %5                 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
    movdqa    %9, %4
    SSE2_AbsDiffUB %9, %3, %5           ; |q1 - q0|
    movdqa    %10, %1
    SSE2_AbsDiffUB %10, %2, %5          ; |p1 - p0|
    pmaxub    %9, %10                   ; max(|q1 - q0|, |p1 - p0|)
    pxor      %10, %10
    movd      %5, %6
    pshufb    %5, %10                   ; iBeta
    SSE2_CmpgeUB %9, %5                 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por       %8, %9                    ; | !bDeltaP0Q0
    movd      %5, [%7]
%if %11
    punpckldq %5, %5
    punpcklbw %5, %5                    ; iTc
%else
    pshufd    %5, %5, 0                 ; iTc
%endif
    pcmpeqw   %10, %10                  ; FFh
    movdqa    %9, %5
    pcmpgtb   %9, %10                   ; iTc > -1 ? FFh : 00h
    pandn     %8, %5                    ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
    pand      %8, %9                    ; &= (iTc > -1 ? FFh : 00h)
    SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9
%endmacro

; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9
;
; Chroma bS==4 filter on 16 bytes. Results are left in %2 (p0) and %3 (q0);
; %1 and %5 are overwritten as well. The mask built here is the negated
; enable mask, so SSE2_Blend keeps the original pixel where the mask is set
; and takes the filtered value elsewhere.
; NOTE(review): WELS_DB1 is defined outside this file; it presumably fills its
; operand with 0x01 bytes, matching its use as the "c" term of SSE2_AvgbFloor1.
%macro SSSE3_DeblockChromaEq4 9
    movdqa   %7, %3
    SSE2_AbsDiffUB %7, %2, %8         ; |p0 - q0|
    SSE2_CmpgeUB %7, %5               ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
    movdqa   %8, %4
    SSE2_AbsDiffUB %8, %3, %5         ; |q1 - q0|
    movdqa   %9, %1
    SSE2_AbsDiffUB %9, %2, %5         ; |p1 - p0|
    pmaxub   %8, %9                   ; max(|q1 - q0|, |p1 - p0|)
    pxor     %9, %9
    movd     %5, %6
    pshufb   %5, %9                   ; iBeta
    SSE2_CmpgeUB %8, %5               ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
    por      %7, %8                   ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
    WELS_DB1 %5
    movdqa   %8, %2
    SSE2_AvgbFloor1 %8, %4, %5, %9    ; (p0 + q1) >> 1
    pavgb    %8, %1                   ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
    movdqa   %9, %7                   ; keep a copy of the mask; the blend destroys %7
    SSE2_Blend %2, %8, %7             ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
    SSE2_AvgbFloor1 %1, %3, %5, %7    ; (q0 + p1) >> 1
    pavgb    %1, %4                   ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
    SSE2_Blend %3, %1, %9             ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
%endmacro


;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
; Each row is loaded as 8 cb bytes (low half) plus 8 cr bytes (high half).
; Only p0 and q0 are written back.
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

%ifidni arg6, r5
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
%else
    ; arg6 is not the register r5 here, so fetch pTC into r2. iStride in r2 is
    ; no longer needed; the stores below use r3 and plain [r0]/[r1].
    mov      r2, arg6
    SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2,   xmm4, xmm5, xmm6, 1
%endif

    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
    movlps   [r0          ], xmm0             ; store q0 cb
    movhps   [r1          ], xmm0             ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;********************************************************************************
;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;
; Same row layout as DeblockChromaLt4V_ssse3 (cb in the low 8 bytes, cr in the
; high 8 bytes); only p0 and q0 are written back.
;********************************************************************************

WELS_EXTERN DeblockChromaEq4V_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd     xmm7, arg4d
    pxor     xmm0, xmm0
    pshufb   xmm7, xmm0                       ; iAlpha
    mov      r3, r2
    neg      r3                               ; -iStride

    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr

    SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6

    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
    movlps   [r0 + 0 * r2], xmm0              ; store q0 cb
    movhps   [r1 + 0 * r2], xmm0              ; store q0 cr

    POP_XMM
    LOAD_4_PARA_POP
    ret


;*******************************************************************************
;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
; Gathers [p1,p0,q0,q1] from 8 cb and 8 cr lines, filters, and scatters
; [p0,q0] back. r5 is passed as pTC to the filter and is then reused as the
; stack-pointer save register of SSE2_StoreCbCr_4x16H; r4 is its dword/word
; scratch register.
;*******************************************************************************

WELS_EXTERN DeblockChromaLt4H_ssse3
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd       xmm7, arg4d
    pxor       xmm0, xmm0
    pshufb     xmm7, xmm0                     ; iAlpha
    lea        r3, [3 * r2 - 1]               ; 3 * iStride - 1

    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0

    POP_XMM
    LOAD_6_PARA_POP
    ret


;***************************************************************************
;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;
; Gathers [p1,p0,q0,q1] from 8 cb and 8 cr lines, applies the bS==4 chroma
; filter, and scatters [p0,q0] back.
;***************************************************************************

WELS_EXTERN DeblockChromaEq4H_ssse3
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    movd       xmm7, arg4d
    pxor       xmm0, xmm0
    pshufb     xmm7, xmm0                     ; iAlpha
    lea        r3, [3 * r2 - 1]               ; 3 * iStride - 1

    SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
    SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6
%ifdef X86_32
    ; r4/r5 are used as scratch by the store macro. They are saved by hand
    ; here, presumably because LOAD_4_PARA does not preserve them on 32-bit.
    ; arg5d has already been consumed above, so the extra pushes do not
    ; disturb any stack-relative argument access.
    push r4
    push r5
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
    pop r5
    pop r4
%else
    SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
%endif

    POP_XMM
    LOAD_4_PARA_POP
    ret


;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
; Reads 8 bytes from each of 16 consecutive rows starting at pPixY, transposes
; them, and writes 8 rows of 16 bytes (128 bytes in total) to pDst.
; pDst is written with movdqa and must therefore be 16-byte aligned.
; r5 holds the caller's stack pointer while r7 points at a 16-byte aligned
; scratch slot used by this function and by SSE2_TransTwo8x8B.
;
;********************************************************************************

WELS_EXTERN DeblockLumaTransposeH2V_sse2
    push     r3
    push     r4
    push     r5

%assign   push_num   3
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r5,    r7                  ; save stack pointer
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3                  ; round r7 down to a multiple of 16
    sub      r7,    10h                 ; reserve one 16-byte scratch slot

    lea      r3,    [r0 + r1 * 8]       ; row 8
    lea      r4,    [r1 * 3]            ; 3 * iStride

    ; xmmN = [row N | row N + 8], 8 bytes each.
    movq     xmm0,  [r0]
    movq     xmm7,  [r3]
    punpcklqdq   xmm0,  xmm7
    movq     xmm1,  [r0 + r1]
    movq     xmm7,  [r3 + r1]
    punpcklqdq   xmm1,  xmm7
    movq     xmm2,  [r0 + r1*2]
    movq     xmm7,  [r3 + r1*2]
    punpcklqdq   xmm2,  xmm7
    movq     xmm3,  [r0 + r4]
    movq     xmm7,  [r3 + r4]
    punpcklqdq   xmm3,  xmm7

    lea      r0,    [r0 + r1 * 4]       ; row 4
    lea      r3,    [r3 + r1 * 4]       ; row 12
    movq     xmm4,  [r0]
    movq     xmm7,  [r3]
    punpcklqdq   xmm4,  xmm7
    movq     xmm5,  [r0 + r1]
    movq     xmm7,  [r3 + r1]
    punpcklqdq   xmm5,  xmm7
    movq     xmm6,  [r0 + r1*2]
    movq     xmm7,  [r3 + r1*2]
    punpcklqdq   xmm6,  xmm7

    ; All eight xmm registers are needed for data, so xmm0 is parked on the
    ; stack while it serves as the temporary for the last row pair.
    movdqa   [r7],  xmm0
    movq     xmm7,  [r0 + r4]
    movq     xmm0,  [r3 + r4]
    punpcklqdq   xmm7,  xmm0
    movdqa   xmm0,  [r7]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    movdqa   [r2],        xmm4
    movdqa   [r2 + 10h],  xmm2
    movdqa   [r2 + 20h],  xmm3
    movdqa   [r2 + 30h],  xmm7
    movdqa   [r2 + 40h],  xmm5
    movdqa   [r2 + 50h],  xmm1
    movdqa   [r2 + 60h],  xmm6
    movdqa   [r2 + 70h],  xmm0

    mov      r7,    r5                  ; restore stack pointer
    POP_XMM
    pop      r5
    pop      r4
    pop      r3
    ret


;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
; Inverse of DeblockLumaTransposeH2V_sse2: reads 8 rows of 16 bytes from pSrc
; (movdqa, so pSrc must be 16-byte aligned), transposes them, and writes
; 8 bytes to each of 16 consecutive rows starting at pPixY.
; r4 holds the caller's stack pointer while r7 points at a 16-byte aligned
; scratch slot for SSE2_TransTwo8x8B.
;
;*******************************************************************************************

WELS_EXTERN DeblockLumaTransposeV2H_sse2
    push     r3
    push     r4

%assign  push_num 2
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    mov      r4,    r7                  ; save stack pointer
    mov      r3,    r7
    and      r3,    0Fh
    sub      r7,    r3                  ; round r7 down to a multiple of 16
    sub      r7,    10h                 ; reserve one 16-byte scratch slot

    movdqa   xmm0,  [r2]
    movdqa   xmm1,  [r2 + 10h]
    movdqa   xmm2,  [r2 + 20h]
    movdqa   xmm3,  [r2 + 30h]
    movdqa   xmm4,  [r2 + 40h]
    movdqa   xmm5,  [r2 + 50h]
    movdqa   xmm6,  [r2 + 60h]
    movdqa   xmm7,  [r2 + 70h]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea      r2,    [r1 * 3]            ; 3 * iStride (pSrc is no longer needed)

    ; Low 8 bytes of each register go to rows 0..7.
    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,    [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0

    ; Shift the high 8 bytes down; they go to rows 8..15.
    psrldq   xmm4,  8
    psrldq   xmm2,  8
    psrldq   xmm3,  8
    psrldq   xmm7,  8
    psrldq   xmm5,  8
    psrldq   xmm1,  8
    psrldq   xmm6,  8
    psrldq   xmm0,  8

    lea      r0,    [r0 + r1*4]
    movq     [r0],  xmm4
    movq     [r0 + r1],  xmm2
    movq     [r0 + r1*2],  xmm3
    movq     [r0 + r2],  xmm7

    lea      r0,    [r0 + r1*4]
    movq     [r0],  xmm5
    movq     [r0 + r1],  xmm1
    movq     [r0 + r1*2],  xmm6
    movq     [r0 + r2],  xmm0

    mov      r7,    r4                  ; restore stack pointer
    POP_XMM
    pop      r4
    pop      r3
    ret

; Clamps 24 unsigned bytes at [r0] in place to min(value, xmm2): 16 bytes via
; movdqu and 8 more at offset 16 via movq.
; NOTE(review): WELS_DB1 is defined outside this file; it presumably fills
; xmm2 with 0x01 bytes, so every non-zero count becomes 1 - confirm.
; NOTE(review): the C prototype is not visible here; r0 is presumably the
; single pointer argument loaded by LOAD_1_PARA.
WELS_EXTERN WelsNonZeroCount_sse2
    %assign  push_num 0
    LOAD_1_PARA
    movdqu   xmm0, [r0]
    movq     xmm1, [r0+16]
    WELS_DB1 xmm2
    pminub   xmm0, xmm2
    pminub   xmm1, xmm2
    movdqu   [r0], xmm0
    movq     [r0+16], xmm1
    ret