ref: 3cf52554f771fcdf247904fba9714270c7f9e6c1
parent: 918b211990ec2ab891cac748aa3561d5b0db74f8
author: Martin Storsjö <[email protected]>
date: Fri Mar 14 06:29:53 EDT 2014
Back up/restore the xmm6-xmm15 SSE registers within asm functions on win64. According to the Win64 ABI, these registers need to be preserved, and compilers are allowed to rely on their content staying available - not only for float usage but for any usage, anywhere, in the calling C++ code. This adds a macro which pushes the clobbered registers onto the stack if targeting win64 (and a matching one which restores them). The parameter to the macro is the number of xmm registers used (e.g. if using xmm0 - xmm7, the parameter is 8), or in other words, the number of the highest xmm register used plus one. This is similar to how the same issue is handled for the NEON registers q4-q7 with the vpush instruction, except that those need to be preserved on all platforms, not only on one particular platform. This allows removing the XMMREG_PROTECT_* hacks, which can easily fail if the compiler chooses to use the callee-saved xmm registers in an unexpected spot.
--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -335,6 +335,82 @@
%endif
%endmacro
+%macro PUSH_XMM 1 ; PUSH_XMM n: on WIN64, spill xmm6..xmm(n-1) to the stack; n = highest xmm register used + 1
+ %ifdef WIN64 ; expands to nothing unless WIN64 is defined
+ %assign xmm_num_regs %1 ; remembered so that a later POP_XMM restores the same set of registers
+ %if xmm_num_regs > 6 ; xmm0-xmm5 are never saved: a count of 6 or less emits no code
+ %ifdef push_num ; the bookkeeping below is skipped when the caller has not defined push_num
+ %assign push_num push_num+2*(%1-6) ; each 16-byte save slot is counted as two units of push_num
+ %endif
+ sub rsp, 16*(%1 - 6) ; reserve one 16-byte slot per saved register
+ movdqu [rsp], xmm6 ; slot 0 holds xmm6; unaligned store, so rsp need not be 16-byte aligned
+ %endif
+ %if xmm_num_regs > 7
+ movdqu [rsp+16], xmm7 ; in general the slot at [rsp+16*k] holds xmm(6+k)
+ %endif
+ %if xmm_num_regs > 8
+ movdqu [rsp+32], xmm8
+ %endif
+ %if xmm_num_regs > 9
+ movdqu [rsp+48], xmm9
+ %endif
+ %if xmm_num_regs > 10
+ movdqu [rsp+64], xmm10
+ %endif
+ %if xmm_num_regs > 11
+ movdqu [rsp+80], xmm11
+ %endif
+ %if xmm_num_regs > 12
+ movdqu [rsp+96], xmm12
+ %endif
+ %if xmm_num_regs > 13
+ movdqu [rsp+112], xmm13
+ %endif
+ %if xmm_num_regs > 14
+ movdqu [rsp+128], xmm14
+ %endif
+ %if xmm_num_regs > 15
+ movdqu [rsp+144], xmm15 ; xmm15 is the last register this macro handles
+ %endif
+ %endif
+%endmacro
+
+%macro POP_XMM 0 ; reload the registers stored by the most recent PUSH_XMM and release their stack slots
+ %ifdef WIN64 ; expands to nothing unless WIN64 is defined
+ %if xmm_num_regs > 15 ; xmm_num_regs is the count recorded by the preceding PUSH_XMM
+ movdqu xmm15, [rsp+144] ; same rsp-relative slots as PUSH_XMM, so rsp must be back at the value PUSH_XMM left it
+ %endif
+ %if xmm_num_regs > 14
+ movdqu xmm14, [rsp+128]
+ %endif
+ %if xmm_num_regs > 13
+ movdqu xmm13, [rsp+112]
+ %endif
+ %if xmm_num_regs > 12
+ movdqu xmm12, [rsp+96]
+ %endif
+ %if xmm_num_regs > 11
+ movdqu xmm11, [rsp+80]
+ %endif
+ %if xmm_num_regs > 10
+ movdqu xmm10, [rsp+64]
+ %endif
+ %if xmm_num_regs > 9
+ movdqu xmm9, [rsp+48]
+ %endif
+ %if xmm_num_regs > 8
+ movdqu xmm8, [rsp+32]
+ %endif
+ %if xmm_num_regs > 7
+ movdqu xmm7, [rsp+16]
+ %endif
+ %if xmm_num_regs > 6 ; a count of 6 or less saved nothing, so there is nothing to reload or free
+ movdqu xmm6, [rsp] ; slot 0 holds xmm6
+ add rsp, 16*(xmm_num_regs - 6) ; free the save area; push_num and xmm_num_regs are left unchanged by this macro
+ %endif
+ %endif
+%endmacro
+
%macro SIGN_EXTENSION 2
%ifndef X86_32
movsx %1, %2
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -65,6 +65,7 @@
DeblockLumaLt4V_ssse3:
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
+ PUSH_XMM 16
sub rsp,1B0h
lea rbp,[rsp+20h]
movd xmm4,r8d
@@ -313,6 +314,7 @@
movdqa [r12+rcx],xmm0
mov r12,qword [rbp+180h]
lea rsp,[rbp+190h]
+ POP_XMM
pop rbp
ret
@@ -787,6 +789,7 @@
mov rax,rsp
push rbx
push rdi
+ PUSH_XMM 16
sub rsp,0C8h
mov r10,qword [rax + 30h] ; pTC
pxor xmm1,xmm1
@@ -841,7 +844,7 @@
punpckhbw xmm2,xmm1
punpcklbw xmm14,xmm1
movd xmm0,eax
- movsx eax,word [rsp + 0C8h + 38h] ; iBeta
+ movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
punpckhbw xmm13,xmm1
punpckhbw xmm15,xmm1
movdqa xmm3,xmm9
@@ -937,6 +940,7 @@
movq [rdi],xmm14
movaps xmm14,[rsp+30h]
mov rsp,r11
+ POP_XMM
pop rdi
pop rbx
ret
@@ -947,6 +951,7 @@
DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
+ PUSH_XMM 15
sub rsp,90h
pxor xmm1,xmm1
mov r11,rcx
@@ -983,7 +988,7 @@
punpcklbw xmm9,xmm1
punpckhbw xmm10,xmm1
movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
+ movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
punpckhbw xmm13,xmm1
movdqa xmm7,xmm12
punpcklwd xmm0,xmm0
@@ -1089,6 +1094,7 @@
movaps xmm12,[r11-70h]
movaps xmm13,[r11-80h]
mov rsp,r11
+ POP_XMM
pop rbx
ret
@@ -1102,6 +1108,7 @@
mov rax,rsp
mov [rax+20h],rbx
push rdi
+ PUSH_XMM 16
sub rsp,140h
mov rdi,rdx
lea eax,[r8*4]
@@ -1194,7 +1201,7 @@
movd xmm0,eax
movdqa xmm4,xmm12
movdqa xmm8,xmm11
- movsx eax,word [rsp+170h] ; iBeta
+ movsx eax,word [rsp+170h + 160] ; iBeta
punpcklwd xmm0,xmm0
punpcklbw xmm4,xmm1
punpckhbw xmm12,xmm1
@@ -1352,9 +1359,9 @@
mov [rbx+r10*2],eax
mov eax,[rsp+7Ch]
mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
+ lea rsp,[rsp+140h]
+ POP_XMM
+ mov rbx, [rsp+28h]
pop rdi
ret
@@ -1369,6 +1376,7 @@
push rsi
push rdi
push r12
+ PUSH_XMM 16
sub rsp,170h
movsxd rsi,r8d
@@ -1452,7 +1460,7 @@
punpckhdq xmm7,xmm0
movdqa xmm0,xmm1
punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h] ; pTC
+ mov rax, [rsp+1C8h+160] ; pTC
punpckhdq xmm1,xmm5
movdqa xmm9,xmm6
punpckhqdq xmm6,xmm0
@@ -1490,7 +1498,7 @@
punpckhbw xmm9,xmm1
punpckhbw xmm8,xmm1
punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h] ; iBeta
+ movsx eax,word [rsp+1C0h+160] ; iBeta
mov word [rsp+4],r8w
mov word [rsp+2],r9w
pshufd xmm12,xmm0,0
@@ -1634,6 +1642,7 @@
mov [r10+rbp],eax
lea r11,[rsp+170h]
mov rsp,r11
+ POP_XMM
pop r12
pop rdi
pop rsi
@@ -5184,6 +5193,7 @@
%assign push_num 3
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -5240,6 +5250,7 @@
movdqa [r2 + 70h], xmm0
mov r7, r5
+ POP_XMM
pop r5
pop r4
pop r3
@@ -5262,6 +5273,7 @@
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -5319,6 +5331,7 @@
mov r7, r4
+ POP_XMM
pop r4
pop r3
ret
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -375,6 +375,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
@@ -476,6 +477,7 @@
; for left & right border expanding
exp_cross_sse2 32,a
+ POP_XMM
LOAD_4_PARA_POP
pop r6
@@ -502,6 +504,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@@ -602,6 +605,7 @@
; for left & right border expanding
exp_cross_sse2 16,a
+ POP_XMM
LOAD_4_PARA_POP
pop r6
@@ -627,6 +631,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@@ -727,6 +732,7 @@
; for left & right border expanding
exp_cross_sse2 16,u
+ POP_XMM
LOAD_4_PARA_POP
pop r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -74,6 +74,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -119,6 +120,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@@ -137,6 +139,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -182,6 +185,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@@ -200,6 +204,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -223,6 +228,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -153,6 +153,7 @@
McChromaWidthEq8_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@@ -212,6 +213,7 @@
dec r5
jnz near .xloop
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -232,6 +234,7 @@
McChromaWidthEq8_ssse3:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@@ -288,6 +291,7 @@
sub r5, 2
jnz .hloop_chroma
+ POP_XMM
LOAD_6_PARA_POP
ret
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -178,6 +178,7 @@
McHorVer22Width8HorFirst_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -214,6 +215,7 @@
add r2, r3
dec r4
jnz .yloop_width_8
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -229,6 +231,7 @@
McHorVer20WidthEq8_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -269,6 +272,7 @@
dec r4
jnz near .y_loop
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -284,6 +288,7 @@
McHorVer20WidthEq16_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -350,6 +355,7 @@
dec r4
jnz near .y_loop
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -365,6 +371,7 @@
McHorVer02WidthEq8_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -434,6 +441,7 @@
jmp near .start
.xx_exit:
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -462,6 +470,7 @@
McHorVer02Height9Or17_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -579,6 +588,7 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -596,6 +606,7 @@
McHorVer20Width9Or17_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -656,6 +667,7 @@
add r2, r3
dec r5
jnz .yloop_width_9
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -737,6 +749,7 @@
add r2, r3
dec r5
jnz .yloop_width_17
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -754,6 +767,7 @@
McHorVer22HorFirst_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -810,6 +824,7 @@
add r2, r3
dec r5
jnz .yloop_width_9
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -884,6 +899,7 @@
add r2, r3
dec r5
jnz .yloop_width_17
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -921,6 +937,7 @@
McHorVer22Width8VerLastAlign_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -1034,6 +1051,7 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -1050,6 +1068,7 @@
McHorVer22Width8VerLastUnAlign_sse2:
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -1162,5 +1181,6 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -160,6 +160,7 @@
WelsSampleSatd4x4_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movd xmm0, [r0]
@@ -221,6 +222,7 @@
movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -234,6 +236,7 @@
WelsSampleSatd8x8_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -242,6 +245,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -255,6 +259,7 @@
WelsSampleSatd8x16_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -268,6 +273,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -281,6 +287,7 @@
WelsSampleSatd16x8_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -299,6 +306,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -312,6 +320,7 @@
WelsSampleSatd16x16_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -338,6 +347,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -990,6 +1000,7 @@
WelsSampleSatd4x4_sse41:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm4,[HSwapSumSubDB1]
@@ -1031,6 +1042,7 @@
pabsw xmm2,xmm2
pmaxsw xmm0,xmm2
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -1048,6 +1060,7 @@
%endif
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@@ -1059,6 +1072,7 @@
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1081,6 +1095,7 @@
%endif
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@@ -1096,6 +1111,7 @@
cmp r6, 4
jl loop_get_satd_8x16
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@@ -1118,6 +1134,7 @@
%endif
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -1141,6 +1158,7 @@
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1164,6 +1182,7 @@
%endif
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
@@ -1196,6 +1215,7 @@
cmp r6, 4
jl loop_get_satd_16x16_right
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@@ -1285,6 +1305,7 @@
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
lea r4, [3*r1]
@@ -1304,6 +1325,7 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1349,6 +1371,7 @@
WelsSampleSad8x16_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -1367,6 +1390,7 @@
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -1390,6 +1414,7 @@
push r5
%endif
%assign push_num 3
+ PUSH_XMM 8
mov r0, arg1
mov r1, arg2
SIGN_EXTENSION r1, r1d
@@ -1482,6 +1507,7 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
+ POP_XMM
%ifdef X86_32
pop r5
pop r4
@@ -1494,6 +1520,7 @@
pop r2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -1504,6 +1531,7 @@
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
.return:
ret
@@ -1539,6 +1567,7 @@
WelsSampleSadFour16x16_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1649,6 +1678,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1657,6 +1687,7 @@
WelsSampleSadFour16x8_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1735,6 +1766,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1742,6 +1774,7 @@
WelsSampleSadFour8x16_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1946,6 +1979,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1954,6 +1988,7 @@
WelsSampleSadFour8x8_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -2067,6 +2102,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -161,6 +161,7 @@
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@@ -244,6 +245,7 @@
pop r4
pop r3
%endif
+ POP_XMM
ret
@@ -256,6 +258,7 @@
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@@ -339,6 +342,7 @@
pop r4
pop r3
%endif
+ POP_XMM
ret
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -58,6 +58,7 @@
WelsResBlockZero16x16_sse2:
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@@ -119,6 +120,7 @@
movdqa [r0+r2], xmm7
movdqa [r0+r2+10h], xmm7
+ POP_XMM
ret
@@ -131,6 +133,7 @@
WelsResBlockZero8x8_sse2:
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@@ -149,5 +152,6 @@
movdqa [r0+r2], xmm7
+ POP_XMM
ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -227,6 +227,7 @@
push r4
%assign push_num 2
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0 ; save r0 in r4
sub r0, 1
@@ -306,6 +307,7 @@
cmp r2, 16
jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
ret
@@ -394,6 +396,7 @@
push r4
%assign push_num 2
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0
sub r0, 1
@@ -472,6 +475,7 @@
cmp r2, 8
jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
WELSEMMS
@@ -1209,6 +1213,7 @@
WelsDecoderI16x16LumaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@@ -1271,6 +1276,7 @@
movdqa [r0+2*r1], xmm0
movdqa [r0+r2], xmm1
+ POP_XMM
ret
ALIGN 16
@@ -1389,6 +1395,7 @@
WelsDecoderIChromaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@@ -1418,6 +1425,7 @@
movq [r0+r1], xmm0
movq [r0+2*r1], xmm0
movq [r0+r2], xmm0
+ POP_XMM
ret
ALIGN 16
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -295,6 +295,7 @@
WelsDctFourT4_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
@@ -332,6 +333,7 @@
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -344,6 +346,7 @@
WelsIDctFourT4Rec_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
@@ -383,6 +386,7 @@
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
@@ -403,6 +407,7 @@
WelsIDctRecI16x16Dc_sse2:
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
@@ -439,6 +444,7 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -478,6 +484,7 @@
WelsHadamardT4Dc_sse2:
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
@@ -503,4 +510,5 @@
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
+ POP_XMM
ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -238,6 +238,7 @@
push r4
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@@ -313,6 +314,7 @@
inc r3
cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
ret
@@ -396,6 +398,7 @@
push r4
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@@ -470,6 +473,7 @@
inc r3
cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
WELSEMMS
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -144,6 +144,7 @@
WelsQuantFour4x4Max_sse2:
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2]
@@ -169,6 +170,7 @@
pmaxsw xmm0, xmm1
movq [r3], xmm0
+ POP_XMM
LOAD_4_PARA_POP
ret
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -178,6 +178,7 @@
push r3
%assign push_num 1
LOAD_2_PARA
+ PUSH_XMM 8
pxor xmm7, xmm7
@@ -214,6 +215,7 @@
movq [r3], xmm5
+ POP_XMM
pop r3
%assign push_num 0
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -468,6 +468,7 @@
push r15
%assign push_num 4
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r3,r3d
@@ -537,6 +538,7 @@
sub r1, r0
mov [r4+2], r1w ; to store uiTextureIndex
+ POP_XMM
LOAD_5_PARA_POP
pop r15
pop r14
@@ -570,6 +572,7 @@
push r13
%assign push_num 2
LOAD_7_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2,r2d
SIGN_EXTENSION r3,r3d
SIGN_EXTENSION r4,r4d
@@ -637,6 +640,7 @@
%undef psadframe
%undef psad8x8
%undef pushsize
+ POP_XMM
LOAD_7_PARA_POP
pop r13
pop r12
@@ -807,6 +811,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 8
%ifdef WIN64
mov r4, arg5 ;iPicStride
@@ -902,6 +907,7 @@
paddd xmm7, xmm5
movd [r15], xmm7
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1108,6 +1114,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
@@ -1218,6 +1225,7 @@
mov r13, psadframe
movd [r13], xmm8
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1680,6 +1688,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
; mov r5,arg6
@@ -1805,6 +1814,7 @@
mov r13, psadframe
movd [r13], xmm8
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1855,6 +1865,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
;mov r5,arg6
@@ -2027,6 +2038,7 @@
mov r14, psadframe
movd [r14], xmm8
+ POP_XMM
pop r15
pop r14
pop r13