ref: c30d5703e5078337b7e01213145ae758d74df450
dir: /vp8/common/x86/postproc_mmx.asm/
; ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source ; tree. An additional intellectual property rights grant can be found ; in the file PATENTS. All contributing project authors may ; be found in the AUTHORS file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst, ; int pitch, int rows, int cols,int flimit) extern sym(vp8_rv) global sym(vp8_mbpost_proc_down_mmx) PRIVATE sym(vp8_mbpost_proc_down_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 136 ; unsigned char d[16][8] at [rsp] ; create flimit2 at [rsp+128] mov eax, dword ptr arg(4) ;flimit mov [rsp+128], eax mov [rsp+128+4], eax %define flimit2 [rsp+128] %if ABI_IS_32BIT=0 lea r8, [GLOBAL(sym(vp8_rv))] %endif ;rows +=8; add dword ptr arg(2), 8 ;for(c=0; c<cols; c+=4) .loop_col: mov rsi, arg(0) ;s pxor mm0, mm0 ; movsxd rax, dword ptr arg(1) ;pitch ; ; this copies the last row down into the border 8 rows mov rdi, rsi mov rdx, arg(2) sub rdx, 9 imul rdx, rax lea rdi, [rdi+rdx] movq mm1, QWORD ptr[rdi] ; first row mov rcx, 8 .init_borderd ; initialize borders lea rdi, [rdi + rax] movq [rdi], mm1 dec rcx jne .init_borderd neg rax ; rax = -pitch ; this copies the first row up into the border 8 rows mov rdi, rsi movq mm1, QWORD ptr[rdi] ; first row mov rcx, 8 .init_border ; initialize borders lea rdi, [rdi + rax] movq [rdi], mm1 dec rcx jne .init_border lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] neg rax pxor mm5, mm5 pxor mm6, mm6 ; pxor mm7, mm7 ; mov rdi, rsi mov rcx, 15 ; .loop_initvar: movd mm1, DWORD PTR [rdi]; punpcklbw mm1, mm0 ; paddw mm5, mm1 ; pmullw mm1, mm1 ; movq mm2, mm1 ; punpcklwd mm1, mm0 ; punpckhwd mm2, mm0 ; paddd mm6, mm1 ; paddd mm7, mm2 ; lea rdi, [rdi+rax] ; dec rcx jne .loop_initvar ;save the var and sum xor rdx, rdx .loop_row: movd mm1, DWORD PTR [rsi] ; [s-pitch*8] movd mm2, DWORD PTR [rdi] ; [s+pitch*7] punpcklbw mm1, mm0 punpcklbw mm2, mm0 paddw mm5, mm2 psubw mm5, mm1 pmullw mm2, mm2 movq mm4, mm2 punpcklwd mm2, mm0 punpckhwd mm4, mm0 paddd mm6, mm2 paddd mm7, mm4 pmullw mm1, mm1 movq mm2, mm1 punpcklwd mm1, mm0 psubd mm6, mm1 punpckhwd mm2, mm0 psubd mm7, mm2 movq mm3, mm6 pslld mm3, 4 psubd mm3, mm6 movq mm1, mm5 movq mm4, mm5 pmullw mm1, mm1 pmulhw mm4, mm4 movq mm2, mm1 punpcklwd mm1, mm4 punpckhwd mm2, mm4 movq mm4, mm7 pslld mm4, 4 psubd mm4, mm7 psubd mm3, mm1 psubd mm4, mm2 psubd mm3, flimit2 psubd mm4, flimit2 psrad mm3, 31 psrad mm4, 31 packssdw mm3, mm4 packsswb mm3, mm0 movd mm1, DWORD PTR [rsi+rax*8] movq mm2, mm1 punpcklbw mm1, mm0 paddw mm1, mm5 mov rcx, rdx and rcx, 127 %if ABI_IS_32BIT=1 && CONFIG_PIC=1 push rax lea rax, [GLOBAL(sym(vp8_rv))] movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2] pop rax %elif ABI_IS_32BIT=0 movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2] %else movq mm4, [sym(vp8_rv) + rcx*2] %endif paddw mm1, mm4 psraw mm1, 4 packuswb mm1, mm0 pand mm1, mm3 pandn mm3, mm2 por mm1, mm3 and rcx, 15 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] mov rcx, rdx sub rcx, 8 and rcx, 15 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] movd [rsi], mm1 lea rsi, [rsi+rax] lea rdi, [rdi+rax] add rdx, 1 cmp edx, dword arg(2) ;rows jl .loop_row add dword arg(0), 4 ; s += 4 sub dword arg(3), 4 ; cols -= 4 cmp dword arg(3), 0 jg .loop_col add rsp, 136 pop rsp ; begin epilog pop rdi pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret %undef flimit2 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, ; unsigned char blackclamp[16], ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], ; unsigned int Width, unsigned int Height, int Pitch) extern sym(rand) global sym(vp8_plane_add_noise_mmx) PRIVATE sym(vp8_plane_add_noise_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 GET_GOT rbx push rsi push rdi ; end prolog .addnoise_loop: call sym(rand) WRT_PLT mov rcx, arg(1) ;noise and rax, 0xff add rcx, rax ; we rely on the fact that the clamping vectors are stored contiguously ; in black/white/both order. Note that we have to reload this here because ; rdx could be trashed by rand() mov rdx, arg(2) ; blackclamp mov rdi, rcx movsxd rcx, dword arg(5) ;[Width] mov rsi, arg(0) ;Pos xor rax,rax .addnoise_nextset: movq mm1,[rsi+rax] ; get the source psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise paddusb mm1, [rdx+32] ;bothclamp psubusb mm1, [rdx+16] ;whiteclamp movq mm2,[rdi+rax] ; get the noise for this line paddb mm1,mm2 ; add it in movq [rsi+rax],mm1 ; store the result add rax,8 ; move to the next line cmp rax, rcx jl .addnoise_nextset movsxd rax, dword arg(7) ; Pitch add arg(0), rax ; Start += Pitch sub dword arg(6), 1 ; Height -= 1 jg .addnoise_loop ; begin epilog pop rdi pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret SECTION_RODATA align 16 Blur: times 16 dw 16 times 8 dw 64 times 16 dw 16 times 8 dw 0 rd: times 4 dw 0x40