shithub: openh264

Download patch

ref: 3cf52554f771fcdf247904fba9714270c7f9e6c1
parent: 918b211990ec2ab891cac748aa3561d5b0db74f8
author: Martin Storsjö <[email protected]>
date: Fri Mar 14 06:29:53 EDT 2014

Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64

According to the Win64 ABI, these registers need to be preserved,
and compilers are allowed to rely on their content to stay
available - not only for float usage but for any usage, anywhere,
in the calling C++ code.

This adds a macro which pushes the clobbered registers onto the
stack if targeting win64 (and a matching one which restores them).
The parameter to the macro is the number of xmm registers used
(e.g. if using xmm0 - xmm7, the parameter is 8), or in other
words, the number of the highest xmm register used plus one.

This is similar to how the same issue is handled for the NEON
registers q4-q7 with the vpush instruction, except that they needed
to be preserved on all platforms, not only on one particular platform.

This allows removing the XMMREG_PROTECT_* hacks, which can
easily fail if the compiler chooses to use the callee-saved
xmm registers in an unexpected spot.

--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -335,6 +335,82 @@
     %endif
 %endmacro
 
+%macro PUSH_XMM 1 ; arg 1 = number of xmm registers used (highest xmm index + 1); on WIN64 spills xmm6 and up to the stack, expands to nothing elsewhere
+    %ifdef WIN64
+        %assign xmm_num_regs %1 ; remembered so that the following POP_XMM knows how many registers to reload
+        %if xmm_num_regs > 6
+            %ifdef push_num
+                %assign push_num push_num+2*(%1-6) ; two units per 16-byte register; presumably push_num counts 8-byte stack slots for stack-argument offsets - TODO confirm
+            %endif
+            sub rsp, 16*(%1 - 6) ; reserve one 16-byte slot for each register from xmm6 upward
+            movdqu [rsp], xmm6 ; unaligned store, so rsp does not have to be 16-byte aligned here
+        %endif
+        %if xmm_num_regs > 7
+            movdqu [rsp+16], xmm7 ; slot offset = 16 * (register index - 6), same pattern for every register below
+        %endif
+        %if xmm_num_regs > 8
+            movdqu [rsp+32], xmm8
+        %endif
+        %if xmm_num_regs > 9
+            movdqu [rsp+48], xmm9
+        %endif
+        %if xmm_num_regs > 10
+            movdqu [rsp+64], xmm10
+        %endif
+        %if xmm_num_regs > 11
+            movdqu [rsp+80], xmm11
+        %endif
+        %if xmm_num_regs > 12
+            movdqu [rsp+96], xmm12
+        %endif
+        %if xmm_num_regs > 13
+            movdqu [rsp+112], xmm13
+        %endif
+        %if xmm_num_regs > 14
+            movdqu [rsp+128], xmm14
+        %endif
+        %if xmm_num_regs > 15
+            movdqu [rsp+144], xmm15
+        %endif
+    %endif
+%endmacro
+
+%macro POP_XMM 0 ; on WIN64 reloads the registers spilled by the preceding PUSH_XMM and frees the save area; expands to nothing elsewhere
+    %ifdef WIN64
+        %if xmm_num_regs > 15 ; xmm_num_regs holds the count given to the most recent PUSH_XMM in assembly order
+            movdqu xmm15, [rsp+144] ; offsets mirror PUSH_XMM, so rsp must be back at the value PUSH_XMM left it at
+        %endif
+        %if xmm_num_regs > 14
+            movdqu xmm14, [rsp+128]
+        %endif
+        %if xmm_num_regs > 13
+            movdqu xmm13, [rsp+112]
+        %endif
+        %if xmm_num_regs > 12
+            movdqu xmm12, [rsp+96]
+        %endif
+        %if xmm_num_regs > 11
+            movdqu xmm11, [rsp+80]
+        %endif
+        %if xmm_num_regs > 10
+            movdqu xmm10, [rsp+64]
+        %endif
+        %if xmm_num_regs > 9
+            movdqu xmm9, [rsp+48]
+        %endif
+        %if xmm_num_regs > 8
+            movdqu xmm8, [rsp+32]
+        %endif
+        %if xmm_num_regs > 7
+            movdqu xmm7, [rsp+16]
+        %endif
+        %if xmm_num_regs > 6
+            movdqu xmm6, [rsp]
+            add rsp, 16*(xmm_num_regs - 6) ; release the save area; note that push_num is not adjusted back by this macro
+        %endif
+    %endif
+%endmacro
+
 %macro SIGN_EXTENSION 2
     %ifndef X86_32
             movsx %1, %2
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -65,6 +65,7 @@
 DeblockLumaLt4V_ssse3:
   push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
+  PUSH_XMM 16
   sub         rsp,1B0h
   lea         rbp,[rsp+20h]
   movd        xmm4,r8d
@@ -313,6 +314,7 @@
   movdqa      [r12+rcx],xmm0
   mov         r12,qword [rbp+180h]
   lea         rsp,[rbp+190h]
+  POP_XMM
   pop         rbp
   ret
 
@@ -787,6 +789,7 @@
   mov         rax,rsp
   push        rbx
   push        rdi
+  PUSH_XMM 16
   sub         rsp,0C8h
   mov         r10,qword [rax + 30h]  ; pTC
   pxor        xmm1,xmm1
@@ -841,7 +844,7 @@
   punpckhbw   xmm2,xmm1
   punpcklbw   xmm14,xmm1
   movd        xmm0,eax
-  movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
+  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
   punpckhbw   xmm13,xmm1
   punpckhbw   xmm15,xmm1
   movdqa      xmm3,xmm9
@@ -937,6 +940,7 @@
   movq        [rdi],xmm14
   movaps      xmm14,[rsp+30h]
   mov         rsp,r11
+  POP_XMM
   pop         rdi
   pop         rbx
   ret
@@ -947,6 +951,7 @@
 DeblockChromaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
+  PUSH_XMM 15
   sub         rsp,90h
   pxor        xmm1,xmm1
   mov         r11,rcx
@@ -983,7 +988,7 @@
   punpcklbw   xmm9,xmm1
   punpckhbw   xmm10,xmm1
   movd        xmm0,eax
-  movsx       eax,word [rsp + 90h + 8h + 28h]   ; iBeta
+  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
   punpckhbw   xmm13,xmm1
   movdqa      xmm7,xmm12
   punpcklwd   xmm0,xmm0
@@ -1089,6 +1094,7 @@
   movaps      xmm12,[r11-70h]
   movaps      xmm13,[r11-80h]
   mov         rsp,r11
+  POP_XMM
   pop         rbx
   ret
 
@@ -1102,6 +1108,7 @@
   mov         rax,rsp
   mov         [rax+20h],rbx
   push        rdi
+  PUSH_XMM 16
   sub         rsp,140h
   mov         rdi,rdx
   lea         eax,[r8*4]
@@ -1194,7 +1201,7 @@
   movd        xmm0,eax
   movdqa      xmm4,xmm12
   movdqa      xmm8,xmm11
-  movsx       eax,word [rsp+170h] ; iBeta
+  movsx       eax,word [rsp+170h + 160] ; iBeta
   punpcklwd   xmm0,xmm0
   punpcklbw   xmm4,xmm1
   punpckhbw   xmm12,xmm1
@@ -1352,9 +1359,9 @@
   mov         [rbx+r10*2],eax
   mov         eax,[rsp+7Ch]
   mov         [rdx+rbx],eax
-  lea         r11,[rsp+140h]
-  mov         rbx, [r11+28h]
-  mov         rsp,r11
+  lea         rsp,[rsp+140h]
+  POP_XMM
+  mov         rbx, [rsp+28h]
   pop         rdi
   ret
 
@@ -1369,6 +1376,7 @@
   push        rsi
   push        rdi
   push        r12
+  PUSH_XMM 16
   sub         rsp,170h
 
   movsxd      rsi,r8d
@@ -1452,7 +1460,7 @@
   punpckhdq   xmm7,xmm0
   movdqa      xmm0,xmm1
   punpckldq   xmm0,xmm5
-  mov         rax, [rsp+1C8h]    ; pTC
+  mov         rax, [rsp+1C8h+160]    ; pTC
   punpckhdq   xmm1,xmm5
   movdqa      xmm9,xmm6
   punpckhqdq  xmm6,xmm0
@@ -1490,7 +1498,7 @@
   punpckhbw   xmm9,xmm1
   punpckhbw   xmm8,xmm1
   punpcklwd   xmm0,xmm0
-  movsx       eax,word [rsp+1C0h]   ; iBeta
+  movsx       eax,word [rsp+1C0h+160]   ; iBeta
   mov         word [rsp+4],r8w
   mov         word [rsp+2],r9w
   pshufd      xmm12,xmm0,0
@@ -1634,6 +1642,7 @@
   mov         [r10+rbp],eax
   lea         r11,[rsp+170h]
   mov         rsp,r11
+  POP_XMM
   pop         r12
   pop         rdi
   pop         rsi
@@ -5184,6 +5193,7 @@
 
 %assign   push_num   3
     LOAD_3_PARA
+    PUSH_XMM 8
 
     SIGN_EXTENSION   r1, r1d
 
@@ -5240,6 +5250,7 @@
     movdqa  [r2 + 70h],  xmm0
 
     mov     r7,   r5
+    POP_XMM
     pop     r5
     pop     r4
     pop     r3
@@ -5262,6 +5273,7 @@
 
 %assign  push_num 2
     LOAD_3_PARA
+    PUSH_XMM 8
 
     SIGN_EXTENSION   r1, r1d
 
@@ -5319,6 +5331,7 @@
 
 
     mov      r7,   r4
+    POP_XMM
     pop      r4
     pop      r3
     ret
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -375,6 +375,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r2, r2d
@@ -476,6 +477,7 @@
     ; for left & right border expanding
     exp_cross_sse2 32,a
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
@@ -502,6 +504,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1,r1d
     SIGN_EXTENSION r2,r2d
@@ -602,6 +605,7 @@
     ; for left & right border expanding
     exp_cross_sse2 16,a
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
@@ -627,6 +631,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1,r1d
     SIGN_EXTENSION r2,r2d
@@ -727,6 +732,7 @@
     ; for left & right border expanding
     exp_cross_sse2 16,u
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -74,6 +74,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -119,6 +120,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
@@ -137,6 +139,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -182,6 +185,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
@@ -200,6 +204,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -223,6 +228,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -153,6 +153,7 @@
 McChromaWidthEq8_sse2:
 	%assign  push_num 0
 	LOAD_6_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r5, r5d
@@ -212,6 +213,7 @@
 	dec r5
 	jnz near .xloop
 
+	POP_XMM
 	LOAD_6_PARA_POP
 
 	ret
@@ -232,6 +234,7 @@
 McChromaWidthEq8_ssse3:
 	%assign  push_num 0
 	LOAD_6_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r5, r5d
@@ -288,6 +291,7 @@
 	sub r5, 2
 	jnz .hloop_chroma
 
+	POP_XMM
 	LOAD_6_PARA_POP
 
 	ret
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -178,6 +178,7 @@
 McHorVer22Width8HorFirst_sse2:
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -214,6 +215,7 @@
 	add r2, r3
 	dec r4
 	jnz .yloop_width_8
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -229,6 +231,7 @@
 McHorVer20WidthEq8_sse2:
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -269,6 +272,7 @@
 	dec r4
 	jnz near .y_loop
 
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -284,6 +288,7 @@
 McHorVer20WidthEq16_sse2:
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -350,6 +355,7 @@
 	dec r4
 	jnz near .y_loop
 
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -365,6 +371,7 @@
 McHorVer02WidthEq8_sse2:
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -434,6 +441,7 @@
 	jmp near .start
 
 .xx_exit:
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -462,6 +470,7 @@
 McHorVer02Height9Or17_sse2:
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -579,6 +588,7 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -596,6 +606,7 @@
 McHorVer20Width9Or17_sse2:
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -656,6 +667,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_9
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -737,6 +749,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_17
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -754,6 +767,7 @@
 McHorVer22HorFirst_sse2:
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -810,6 +824,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_9
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -884,6 +899,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_17
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -921,6 +937,7 @@
  McHorVer22Width8VerLastAlign_sse2:
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -1034,6 +1051,7 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -1050,6 +1068,7 @@
  McHorVer22Width8VerLastUnAlign_sse2:
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -1162,5 +1181,6 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -160,6 +160,7 @@
 WelsSampleSatd4x4_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
     movd      xmm0, [r0]
@@ -221,6 +222,7 @@
 	movd           retrd,  xmm6
     and            retrd,  0xffff
     shr            retrd,  1
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -234,6 +236,7 @@
  WelsSampleSatd8x8_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm6,   xmm6
@@ -242,6 +245,7 @@
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -255,6 +259,7 @@
  WelsSampleSatd8x16_sse2:
 	 %assign  push_num 0
 	 LOAD_4_PARA
+	 PUSH_XMM 8
 	 SIGN_EXTENSION r1, r1d
 	 SIGN_EXTENSION r3, r3d
 	 pxor   xmm6,   xmm6
@@ -268,6 +273,7 @@
 	 psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    retrd,   xmm6
+	 POP_XMM
 	 LOAD_4_PARA_POP
 	 ret
 
@@ -281,6 +287,7 @@
 WelsSampleSatd16x8_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push r0
@@ -299,6 +306,7 @@
 	psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -312,6 +320,7 @@
 WelsSampleSatd16x16_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push r0
@@ -338,6 +347,7 @@
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -990,6 +1000,7 @@
 WelsSampleSatd4x4_sse41:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm4,[HSwapSumSubDB1]
@@ -1031,6 +1042,7 @@
 	pabsw       xmm2,xmm2
 	pmaxsw      xmm0,xmm2
 	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -1048,6 +1060,7 @@
 %endif
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
@@ -1059,6 +1072,7 @@
 	lea			r2,  [r2+4*r3]
 	SSE41_GetSatd8x4
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1081,6 +1095,7 @@
 %endif
 	%assign  push_num 3
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
@@ -1096,6 +1111,7 @@
 	cmp         r6,  4
 	jl          loop_get_satd_8x16
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r6
@@ -1118,6 +1134,7 @@
 %endif
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push  r0
@@ -1141,6 +1158,7 @@
 	lea			r2,  [r2+4*r3]
 	SSE41_GetSatd8x4
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1164,6 +1182,7 @@
 %endif
 	%assign  push_num 3
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 
@@ -1196,6 +1215,7 @@
 	cmp         r6,  4
 	jl          loop_get_satd_16x16_right
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r6
@@ -1285,6 +1305,7 @@
 
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	lea r4, [3*r1]
@@ -1304,6 +1325,7 @@
 	movhlps xmm0, xmm7
 	paddw xmm0, xmm7
 	movd retrd, xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1349,6 +1371,7 @@
 WelsSampleSad8x16_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 7
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
     pxor   xmm6,   xmm6
@@ -1367,6 +1390,7 @@
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       retrd,  xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -1390,6 +1414,7 @@
 	push	r5
 %endif
 	%assign  push_num 3
+	PUSH_XMM 8
 	mov		r0,  arg1
 	mov		r1,  arg2
 	SIGN_EXTENSION r1, r1d
@@ -1482,6 +1507,7 @@
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       retrd,  xmm0
+	POP_XMM
 %ifdef X86_32
 	pop	 r5
 	pop	 r4
@@ -1494,6 +1520,7 @@
 	pop r2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 7
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm6,   xmm6
@@ -1504,6 +1531,7 @@
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       retrd,  xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 .return:
 	ret
@@ -1539,6 +1567,7 @@
 WelsSampleSadFour16x16_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1649,6 +1678,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1657,6 +1687,7 @@
 WelsSampleSadFour16x8_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1735,6 +1766,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1742,6 +1774,7 @@
 WelsSampleSadFour8x16_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1946,6 +1979,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1954,6 +1988,7 @@
 WelsSampleSadFour8x8_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -2067,6 +2102,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -161,6 +161,7 @@
 
     %assign push_num 0
     LOAD_2_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r1,r1d
 
 %ifdef X86_32
@@ -244,6 +245,7 @@
 	pop r4
 	pop r3
 %endif
+	POP_XMM
 
 	ret
 
@@ -256,6 +258,7 @@
 
     %assign push_num 0
     LOAD_2_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r1,r1d
 
 %ifdef X86_32
@@ -339,6 +342,7 @@
 	pop r4
 	pop r3
 %endif
+	POP_XMM
 
 	ret
 
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -58,6 +58,7 @@
 WelsResBlockZero16x16_sse2:
         %assign push_num 0
         LOAD_2_PARA
+        PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
         lea 	r1, 	[r1*2]
         lea 	r2,	[r1*3]
@@ -119,6 +120,7 @@
 	movdqa   [r0+r2],     xmm7
 	movdqa   [r0+r2+10h],     xmm7
 
+	POP_XMM
 	ret
 
 
@@ -131,6 +133,7 @@
 WelsResBlockZero8x8_sse2:
 	  %assign push_num 0
           LOAD_2_PARA
+          PUSH_XMM 8
 	  SIGN_EXTENSION r1, r1d
 	  lea       r1,     [r1*2]
 	  lea       r2,     [r1*3]
@@ -149,5 +152,6 @@
 	  movdqa    [r0+r2],     xmm7
 
 
+	  POP_XMM
 	  ret
 
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -227,6 +227,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r1, r1d
 		mov r4, r0 ; save r0 in r4
 		sub		r0,	1
@@ -306,6 +307,7 @@
 		cmp		r2,	16
 		jnz get_i16x16_luma_pred_plane_sse2_1
 
+		POP_XMM
 		pop r4
 		pop r3
 		ret
@@ -394,6 +396,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r1, r1d
 		mov r4, r0
 		sub		r0,	1
@@ -472,6 +475,7 @@
 		cmp		r2,	8
 		jnz get_i_chroma_pred_plane_sse2_1
 
+		POP_XMM
 		pop r4
 		pop r3
 		WELSEMMS
@@ -1209,6 +1213,7 @@
 WelsDecoderI16x16LumaPredDcTop_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
@@ -1271,6 +1276,7 @@
 	movdqa [r0+2*r1], xmm0
 	movdqa [r0+r2], xmm1
 
+	POP_XMM
 	ret
 
 ALIGN 16
@@ -1389,6 +1395,7 @@
 WelsDecoderIChromaPredDcTop_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
@@ -1418,6 +1425,7 @@
 	movq [r0+r1], xmm0
 	movq [r0+2*r1], xmm0
 	movq [r0+r2], xmm0
+	POP_XMM
 	ret
 
 ALIGN 16
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -295,6 +295,7 @@
 WelsDctFourT4_sse2:
     %assign push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
     pxor    xmm7, xmm7
@@ -332,6 +333,7 @@
 	lea		r0, [r0+64]
 	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
+	POP_XMM
 	LOAD_5_PARA_POP
     ret
 
@@ -344,6 +346,7 @@
 WelsIDctFourT4Rec_sse2:
 	%assign push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	;Load 4x8
@@ -383,6 +386,7 @@
 	lea		r2, [r2 + 2 * r3]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
+	POP_XMM
 	LOAD_5_PARA_POP
    ; pop		esi
    ; pop		ebx
@@ -403,6 +407,7 @@
 WelsIDctRecI16x16Dc_sse2:
 	%assign push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor		xmm7,		xmm7
@@ -439,6 +444,7 @@
 	lea			r0,		[r0 + 2 * r1]
 	lea			r2,		[r2 + 2 * r3]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+	POP_XMM
 	LOAD_5_PARA_POP
     ret
 
@@ -478,6 +484,7 @@
 WelsHadamardT4Dc_sse2:
 		%assign push_num 0
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
@@ -503,4 +510,5 @@
 		movdqa	[r0+ 0],   xmm3
 		movdqa	[r0+16],   xmm2
 
+		POP_XMM
 		ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -238,6 +238,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
@@ -313,6 +314,7 @@
 		inc		r3
 		cmp		r3,	16
 		jnz get_i16x16_luma_pred_plane_sse2_1
+		POP_XMM
 		pop r4
 		pop r3
 		ret
@@ -396,6 +398,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
@@ -470,6 +473,7 @@
 		inc		r3
 		cmp		r3,	8
 		jnz get_i_chroma_pred_plane_sse2_1
+		POP_XMM
 		pop r4
 		pop r3
 		WELSEMMS
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -144,6 +144,7 @@
 WelsQuantFour4x4Max_sse2:
 		%assign push_num 0
 		LOAD_4_PARA
+		PUSH_XMM 8
 		MOVDQ	xmm2, [r1]
 		MOVDQ	xmm3, [r2]
 
@@ -169,6 +170,7 @@
 		pmaxsw	xmm0, xmm1
 
 		movq	[r3], xmm0
+		POP_XMM
 		LOAD_4_PARA_POP
 		ret
 
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -178,6 +178,7 @@
         push r3
         %assign push_num 1
         LOAD_2_PARA
+        PUSH_XMM 8
 
 		pxor		xmm7,	xmm7
 
@@ -214,6 +215,7 @@
 		movq		[r3],	xmm5
 
 
+		POP_XMM
 		pop r3
 		%assign push_num 0
 
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -468,6 +468,7 @@
   push r15
   %assign push_num 4
   LOAD_5_PARA
+  PUSH_XMM 8
   SIGN_EXTENSION r1,r1d
   SIGN_EXTENSION r3,r3d
 
@@ -537,6 +538,7 @@
   sub r1, r0
   mov [r4+2], r1w                               ; to store uiTextureIndex
 
+  POP_XMM
   LOAD_5_PARA_POP
   pop r15
   pop r14
@@ -570,6 +572,7 @@
   push r13
   %assign push_num 2
   LOAD_7_PARA
+  PUSH_XMM 8
   SIGN_EXTENSION r2,r2d
   SIGN_EXTENSION r3,r3d
   SIGN_EXTENSION r4,r4d
@@ -637,6 +640,7 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
+  POP_XMM
   LOAD_7_PARA_POP
   pop r13
   pop r12
@@ -807,6 +811,7 @@
   push r14
   push r15
   %assign push_num 4
+  PUSH_XMM 8
 
 %ifdef WIN64
   mov r4, arg5  ;iPicStride
@@ -902,6 +907,7 @@
   paddd   xmm7,   xmm5
   movd    [r15],  xmm7
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1108,6 +1114,7 @@
   push r14
   push r15
   %assign push_num 4
+  PUSH_XMM 10
 
 %ifdef WIN64
   mov r4,arg5
@@ -1218,6 +1225,7 @@
   mov             r13,    psadframe
   movd    [r13],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1680,6 +1688,7 @@
   push r14
   push r15
 %assign push_num 4
+  PUSH_XMM 10
 %ifdef WIN64
   mov r4,arg5
   ;  mov r5,arg6
@@ -1805,6 +1814,7 @@
   mov             r13,    psadframe
   movd    [r13],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1855,6 +1865,7 @@
   push r14
   push r15
 %assign push_num 4
+  PUSH_XMM 10
 %ifdef WIN64
   mov r4,arg5
   ;mov r5,arg6
@@ -2027,6 +2038,7 @@
   mov             r14,    psadframe
   movd    [r14],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13