shithub: openh264

Download patch

ref: da0f65ea0af5e72d003d457756e1d76978d0ba57
parent: 63100178772d4bb06c6ca0e841e1e7f7d315704d
parent: eaf7d65518b485df226b24ebb3e38a5d13fe9926
author: dongzha <[email protected]>
date: Fri Jul 11 04:48:45 EDT 2014

Merge pull request #1140 from zhilwang/x86_64-downsample

Add X86 64bit asm code for downsample

--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -57,21 +57,21 @@
   sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearFastDownsampler_c;
 #if defined(X86_ASM)
   if (iCpuFlag & WELS_CPU_SSE) {
-    /*  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;
-      sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;
-      sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;*/
+    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;
+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;
+    sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;
   }
   if (iCpuFlag & WELS_CPU_SSE2) {
-    //  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
-    //  sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
+    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
   }
   if (iCpuFlag & WELS_CPU_SSSE3) {
-    //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;
-    //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;
+    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;
+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;
   }
   if (iCpuFlag & WELS_CPU_SSE41) {
-    //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;
-    //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;
+    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;
+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;
   }
 #endif//X86_ASM
 
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -94,13 +94,9 @@
 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
 
 void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
 void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+    const int32_t kiDstHeight, uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
 WELSVP_EXTERN_C_END
 #endif
 
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -202,31 +202,31 @@
 
 
 #ifdef X86_ASM
-//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-//    const int32_t kiDstHeight,
-//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-//  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
-//  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
-//
-//  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
-//  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
-//
-//  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-//                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-//}
-//
-//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-//    const int32_t kiDstHeight,
-//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-//  const int32_t kiScaleBit = 15;
-//  const uint32_t kuiScale = (1 << kiScaleBit);
-//
-//  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
-//  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
-//
-//  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-//      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-//}
+void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
+  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
+
+  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth);
+  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight);
+
+  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+                                       pSrc, kiSrcStride, uiScalex, uiScaley);
+}
+
+void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const int32_t kiScaleBit = 15;
+  const uint32_t kuiScale = (1 << kiScaleBit);
+
+  uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
+  uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
+
+  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+      pSrc, kiSrcStride, uiScalex, uiScaley);
+}
 #endif //X86_ASM
 
 #ifdef HAVE_NEON
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -39,7 +39,7 @@
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
-%ifdef X86_32
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
@@ -64,6 +64,8 @@
     db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
     db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+add_extra_half:
+    dd 16384,0,0,0
 
 
 ;***********************************************************************
@@ -78,28 +80,36 @@
 ;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
 
-    sar ebp, $01            ; iSrcHeight >> 1
-
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01            ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg ebx             ; - (iSrcWidth >> 1)
+.yloops1:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 32 bytes
-.xloops:
+.xloops1:
     ; 1st part horizonal loop: x16 bytes
     ;               mem  hi<-       ->lo
     ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
@@ -108,10 +118,10 @@
     ;: H G F E D C B A, P O N M L K J I
     ;: h g f e d c b a, p o n m l k j i
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movq mm0, [esi]         ; 1st pSrc line
-    movq mm1, [esi+8]       ; 1st pSrc line + 8
-    movq mm2, [esi+ecx]     ; 2nd pSrc line
-    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
+    movq mm0, [r2]         ; 1st pSrc line
+    movq mm1, [r2+8]       ; 1st pSrc line + 8
+    movq mm2, [r2+r3]     ; 2nd pSrc line
+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8
 
     ; to handle mm0, mm1, mm2, mm3
     pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
@@ -156,10 +166,10 @@
     ;: H G F E D C B A, P O N M L K J I
     ;: h g f e d c b a, p o n m l k j i
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movq mm1, [esi+16]      ; 1st pSrc line + 16
-    movq mm2, [esi+24]      ; 1st pSrc line + 24
-    movq mm3, [esi+ecx+16]  ; 2nd pSrc line + 16
-    movq mm4, [esi+ecx+24]  ; 2nd pSrc line + 24
+    movq mm1, [r2+16]      ; 1st pSrc line + 16
+    movq mm2, [r2+24]      ; 1st pSrc line + 24
+    movq mm3, [r2+r3+16]  ; 2nd pSrc line + 16
+    movq mm4, [r2+r3+24]  ; 2nd pSrc line + 24
 
     ; to handle mm1, mm2, mm3, mm4
     pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B
@@ -196,31 +206,33 @@
     pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
     pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
 
-    movq [edi  ], mm0
-    movq [edi+8], mm2
+    movq [r0  ], mm0
+    movq [r0+8], mm2
 
     ; next SMB
-    lea esi, [esi+32]
-    lea edi, [edi+16]
+    lea r2, [r2+32]
+    lea r0, [r0+16]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops1
 
     ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops1
 
     WELSEMMS
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 ;***********************************************************************
@@ -229,28 +241,36 @@
 ;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
 
-    sar ebp, $01        ; iSrcHeight >> 1
-
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01        ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
-    neg ebx         ; - (iSrcWidth >> 1)
+.yloops2:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 16 bytes
-.xloops:
+.xloops2:
     ; 1st part horizonal loop: x16 bytes
     ;               mem  hi<-       ->lo
     ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
@@ -259,10 +279,10 @@
     ;: H G F E D C B A, P O N M L K J I
     ;: h g f e d c b a, p o n m l k j i
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movq mm0, [esi]         ; 1st pSrc line
-    movq mm1, [esi+8]       ; 1st pSrc line + 8
-    movq mm2, [esi+ecx]     ; 2nd pSrc line
-    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
+    movq mm0, [r2]         ; 1st pSrc line
+    movq mm1, [r2+8]       ; 1st pSrc line + 8
+    movq mm2, [r2+r3]     ; 2nd pSrc line
+    movq mm3, [r2+r3+8]   ; 2nd pSrc line + 8
 
     ; to handle mm0, mm1, mm2, mm3
     pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
@@ -299,30 +319,32 @@
     pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
     pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-    movq [edi  ], mm0
+    movq [r0  ], mm0
 
     ; next SMB
-    lea esi, [esi+16]
-    lea edi, [edi+8]
+    lea r2, [r2+16]
+    lea r0, [r0+8]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops2
 
     ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops2
 
     WELSEMMS
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 ;***********************************************************************
@@ -331,28 +353,36 @@
 ;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
 
-    sar ebp, $01        ; iSrcHeight >> 1
-
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01        ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $02        ; (iSrcWidth >> 1) / 4      ; loop count = num_of_mb
-    neg ebx         ; - (iSrcWidth >> 1)
+.yloops3:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $02            ; (iSrcWidth >> 1) / 4     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 8 bytes
-.xloops:
+.xloops3:
     ; 1st part horizonal loop: x8 bytes
     ;               mem  hi<-       ->lo
     ;1st Line Src:  mm0: d D c C b B a A
@@ -361,8 +391,8 @@
     ;: H G F E D C B A
     ;: h g f e d c b a
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movq mm0, [esi]         ; 1st pSrc line
-    movq mm1, [esi+ecx]     ; 2nd pSrc line
+    movq mm0, [r2]         ; 1st pSrc line
+    movq mm1, [r2+r3]     ; 2nd pSrc line
 
     ; to handle mm0, mm1, mm2, mm3
     pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
@@ -385,30 +415,32 @@
     pshufw mm1, mm0, 04eh   ; 01001110 B
     pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-    movd [edi], mm0
+    movd [r0], mm0
 
     ; next unit
-    lea esi, [esi+8]
-    lea edi, [edi+4]
+    lea r2, [r2+8]
+    lea r0, [r0+4]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops3
 
     ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops3
 
     WELSEMMS
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 
@@ -420,31 +452,56 @@
 ;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+    ;push ebx
+    ;push edx
+    ;push esi
+    ;push edi
+    ;push ebp
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+    ;mov edi, [esp+24]   ; pDst
+    ;mov edx, [esp+28]   ; iDstStride
+    ;mov esi, [esp+32]   ; pSrc
+    ;mov ecx, [esp+36]   ; iSrcStride
+    ;mov ebp, [esp+44]   ; iSrcHeight
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    sar ebp, $01            ; iSrcHeight >> 1
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
 
     movdqa xmm7, [shufb_mask_low]   ; mask low
     movdqa xmm6, [shufb_mask_high]  ; mask high
 
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01            ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg ebx             ; - (iSrcWidth >> 1)
+.yloops4:
+    ;mov eax, [esp+40]   ; iSrcWidth
+    ;sar eax, $01            ; iSrcWidth >> 1
+    ;mov ebx, eax        ; iDstWidth restored at ebx
+    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    ;neg ebx             ; - (iSrcWidth >> 1)
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 32 bytes
-.xloops:
+.xloops4:
     ; 1st part horizonal loop: x16 bytes
     ;               mem  hi<-       ->lo
     ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
@@ -458,10 +515,10 @@
     ;: p ..                          a
 
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [esi]          ; 1st_src_line
-    movdqa xmm1, [esi+16]       ; 1st_src_line + 16
-    movdqa xmm2, [esi+ecx]      ; 2nd_src_line
-    movdqa xmm3, [esi+ecx+16]   ; 2nd_src_line + 16
+    movdqa xmm0, [r2]          ; 1st_src_line
+    movdqa xmm1, [r2+16]       ; 1st_src_line + 16
+    movdqa xmm2, [r2+r3]      ; 2nd_src_line
+    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16
 
     ; packing & avg
     movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
@@ -498,29 +555,33 @@
     pavgb xmm0, xmm2
 
     ; write pDst
-    movdqa [edi], xmm0
+    movdqa [r0], xmm0
 
     ; next SMB
-    lea esi, [esi+32]
-    lea edi, [edi+16]
+    lea r2, [r2+32]
+    lea r0, [r0+16]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops4
 
     ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops4
 
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 ;***********************************************************************
@@ -529,30 +590,39 @@
 ;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
+    movdqa xmm5, [shufb_mask_low]   ; mask low
+    movdqa xmm4, [shufb_mask_high]  ; mask high
 
-    sar ebp, $01        ; iSrcHeight >> 1
-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
-
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01        ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
-    neg ebx         ; - (iSrcWidth >> 1)
+.yloops5:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 16 bytes
-.xloops:
+.xloops5:
     ; horizonal loop: x16 bytes by source
     ;               mem  hi<-       ->lo
     ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
@@ -562,13 +632,13 @@
     ;: h g f e d c b a, p o n m l k j i
 
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movdqa xmm0, [esi]          ; 1st_src_line
-    movdqa xmm1, [esi+ecx]      ; 2nd_src_line
+    movdqa xmm0, [r2]          ; 1st_src_line
+    movdqa xmm1, [r2+r3]      ; 2nd_src_line
 
     ; packing & avg
     movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
     ; another implementation for xmm2 high bits
 ;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
 ;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
@@ -575,8 +645,8 @@
     pavgb xmm0, xmm2
 
     movdqa xmm3, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm3, xmm6
+    pshufb xmm1, xmm5
+    pshufb xmm3, xmm4
 ;   psubb xmm3, xmm1
 ;   psrlw xmm3, 8
     pavgb xmm1, xmm3
@@ -585,29 +655,32 @@
     packuswb xmm0, xmm1
 
     ; write pDst
-    movq [edi], xmm0
+    movq [r0], xmm0
 
     ; next SMB
-    lea esi, [esi+16]
-    lea edi, [edi+8]
+    lea r2, [r2+16]
+    lea r0, [r0+8]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops5
 
-    ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops5
 
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
@@ -617,31 +690,40 @@
 ;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
 
-    sar ebp, $01            ; iSrcHeight >> 1
-
     movdqa xmm7, [shufb_mask_low]   ; mask low
     movdqa xmm6, [shufb_mask_high]  ; mask high
 
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01            ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
-    neg ebx             ; - (iSrcWidth >> 1)
+.yloops6:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 32 bytes
-.xloops:
+.xloops6:
     ; 1st part horizonal loop: x16 bytes
     ;               mem  hi<-       ->lo
     ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
@@ -655,10 +737,10 @@
     ;: p ..                          a
 
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [esi]            ; 1st_src_line
-    movntdqa xmm1, [esi+16]     ; 1st_src_line + 16
-    movntdqa xmm2, [esi+ecx]        ; 2nd_src_line
-    movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+    movntdqa xmm0, [r2]            ; 1st_src_line
+    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16
+    movntdqa xmm2, [r2+r3]        ; 2nd_src_line
+    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
 
     ; packing & avg
     movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
@@ -694,29 +776,32 @@
     pavgb xmm0, xmm2
 
     ; write pDst
-    movdqa [edi], xmm0
+    movdqa [r0], xmm0
 
     ; next SMB
-    lea esi, [esi+32]
-    lea edi, [edi+16]
+    lea r2, [r2+32]
+    lea r0, [r0+16]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops6
 
-    ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops6
 
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 ;***********************************************************************
@@ -725,30 +810,39 @@
 ;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-    push ebx
-    push edx
-    push esi
-    push edi
-    push ebp
+%ifdef X86_32
+    push r6
+    %assign push_num 1
+%else
+    %assign push_num 0
+%endif
+    LOAD_6_PARA
+    PUSH_XMM 6
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r4, r4d
+    SIGN_EXTENSION r5, r5d
 
-    mov edi, [esp+24]   ; pDst
-    mov edx, [esp+28]   ; iDstStride
-    mov esi, [esp+32]   ; pSrc
-    mov ecx, [esp+36]   ; iSrcStride
-    mov ebp, [esp+44]   ; iSrcHeight
+%ifndef X86_32
+    push r12
+    mov r12, r4
+%endif
+    sar r5, $01            ; iSrcHeight >> 1
+    movdqa xmm5, [shufb_mask_low]   ; mask low
+    movdqa xmm4, [shufb_mask_high]  ; mask high
 
-    sar ebp, $01        ; iSrcHeight >> 1
-    movdqa xmm7, [shufb_mask_low]   ; mask low
-    movdqa xmm6, [shufb_mask_high]  ; mask high
-
-.yloops:
-    mov eax, [esp+40]   ; iSrcWidth
-    sar eax, $01        ; iSrcWidth >> 1
-    mov ebx, eax        ; iDstWidth restored at ebx
-    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
-    neg ebx         ; - (iSrcWidth >> 1)
+.yloops7:
+%ifdef X86_32
+    mov r4, arg5
+%else
+    mov r4, r12
+%endif
+    sar r4, $01            ; iSrcWidth >> 1
+    mov r6, r4        ; r6 = iDstWidth (iSrcWidth >> 1)
+    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb
+    neg r6             ; - (iSrcWidth >> 1)
     ; each loop = source bandwidth: 16 bytes
-.xloops:
+.xloops7:
     ; horizonal loop: x16 bytes by source
     ;               mem  hi<-       ->lo
     ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
@@ -758,20 +852,20 @@
     ;: h g f e d c b a, p o n m l k j i
 
     ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    movntdqa xmm0, [esi]            ; 1st_src_line
-    movntdqa xmm1, [esi+ecx]        ; 2nd_src_line
+    movntdqa xmm0, [r2]            ; 1st_src_line
+    movntdqa xmm1, [r2+r3]        ; 2nd_src_line
 
     ; packing & avg
     movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
 ;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
 ;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
     pavgb xmm0, xmm2
 
     movdqa xmm3, xmm1
-    pshufb xmm1, xmm7
-    pshufb xmm3, xmm6
+    pshufb xmm1, xmm5
+    pshufb xmm3, xmm4
 ;   psubb xmm3, xmm1
 ;   psrlw xmm3, 8
     pavgb xmm1, xmm3
@@ -780,38 +874,40 @@
     packuswb xmm0, xmm1
 
     ; write pDst
-    movq [edi], xmm0
+    movq [r0], xmm0
 
     ; next SMB
-    lea esi, [esi+16]
-    lea edi, [edi+8]
+    lea r2, [r2+16]
+    lea r0, [r0+8]
 
-    dec eax
-    jg near .xloops
+    dec r4
+    jg near .xloops7
 
     ; next line
-    lea esi, [esi+2*ecx]    ; next end of lines
-    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
-    lea edi, [edi+edx]
-    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
+    lea r2, [r2+2*r3]    ; next end of lines
+    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]
+    lea r0, [r0+r1]
+    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]
 
-    dec ebp
-    jg near .yloops
+    dec r5
+    jg near .yloops7
 
-    pop ebp
-    pop edi
-    pop esi
-    pop edx
-    pop ebx
+%ifndef X86_32
+    pop r12
+%endif
+
+    POP_XMM
+    LOAD_6_PARA_POP
+%ifdef X86_32
+    pop r6
+%endif
     ret
 
 
-
-
-
+%ifdef X86_32
 ;**************************************************************************************************************
 ;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;                           unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned char* pSrc, const int iSrcStride,
 ;                           unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
@@ -822,7 +918,7 @@
     push    edi
     push    ebx
 %define     pushsize    16
-%define     localsize   28
+%define     localsize   16
 %define     pDstData        esp + pushsize + localsize + 4
 %define     dwDstStride     esp + pushsize + localsize + 8
 %define     dwDstWidth      esp + pushsize + localsize + 12
@@ -829,19 +925,15 @@
 %define     dwDstHeight     esp + pushsize + localsize + 16
 %define     pSrcData        esp + pushsize + localsize + 20
 %define     dwSrcStride     esp + pushsize + localsize + 24
-%define     dwSrcWidth      esp + pushsize + localsize + 28
-%define     dwSrcHeight     esp + pushsize + localsize + 32
-%define     scale           esp + 0
-%define     uiScaleX            esp + pushsize + localsize + 36
-%define     uiScaleY            esp + pushsize + localsize + 40
-%define     tmpHeight       esp + 12
-%define     yInverse        esp + 16
-%define     xInverse        esp + 20
-%define     dstStep         esp + 24
+%define     uiScaleX            esp + pushsize + localsize + 28
+%define     uiScaleY            esp + pushsize + localsize + 32
+%define     tmpHeight       esp + 0
+%define     yInverse        esp + 4
+%define     xInverse        esp + 8
+%define     dstStep         esp + 12
     sub     esp,            localsize
 
     pxor    xmm0,   xmm0
-    mov     edx,    32767
     mov     eax,    [uiScaleX]
     and     eax,    32767
     mov     ebx,    eax
@@ -999,7 +1091,6 @@
 %undef      dwDstWidth
 %undef      dwDstHeight
 %undef      dwDstStride
-%undef      scale
 %undef      uiScaleX
 %undef      uiScaleY
 %undef      tmpHeight
@@ -1013,7 +1104,7 @@
 
 ;**************************************************************************************************************
 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;               unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned char* pSrc, const int iSrcStride,
 ;               unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
@@ -1024,7 +1115,7 @@
     push    edi
     push    ebx
 %define     pushsize    16
-%define     localsize   28
+%define     localsize   16
 %define     pDstData        esp + pushsize + localsize + 4
 %define     dwDstStride     esp + pushsize + localsize + 8
 %define     dwDstWidth      esp + pushsize + localsize + 12
@@ -1031,15 +1122,12 @@
 %define     dwDstHeight     esp + pushsize + localsize + 16
 %define     pSrcData        esp + pushsize + localsize + 20
 %define     dwSrcStride     esp + pushsize + localsize + 24
-%define     dwSrcWidth      esp + pushsize + localsize + 28
-%define     dwSrcHeight     esp + pushsize + localsize + 32
-%define     scale           esp + 0
-%define     uiScaleX            esp + pushsize + localsize + 36
-%define     uiScaleY            esp + pushsize + localsize + 40
-%define     tmpHeight       esp + 12
-%define     yInverse        esp + 16
-%define     xInverse        esp + 20
-%define     dstStep         esp + 24
+%define     uiScaleX            esp + pushsize + localsize + 28
+%define     uiScaleY            esp + pushsize + localsize + 32
+%define     tmpHeight       esp + 0
+%define     yInverse        esp + 4
+%define     xInverse        esp + 8
+%define     dstStep         esp + 12
     sub     esp,            localsize
 
     pxor    xmm0,   xmm0
@@ -1191,10 +1279,7 @@
 %undef      dwSrcHeight
 %undef      dwSrcStride
 %undef      pDstData
-%undef      dwDstWidth
-%undef      dwDstHeight
 %undef      dwDstStride
-%undef      scale
 %undef      uiScaleX
 %undef      uiScaleY
 %undef      tmpHeight
@@ -1201,5 +1286,613 @@
 %undef      yInverse
 %undef      xInverse
 %undef      dstStep
+    ret
+
+%elifdef  WIN64
+
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;                           unsigned char* pSrc, const int iSrcStride,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; WIN64 build: bilinear resample of pSrc into pDst using 15-bit fixed-point weights; the last column and the last row are copied unfiltered.
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+    push    r12                 ; r12-r15, rsi, rdi, rbx and rbp are all written below, so they are saved first
+    push    r13
+    push    r14
+    push    r15
+    push    rsi
+    push    rdi
+    push    rbx
+    push    rbp                 ; eight pushes in total, matching push_num on the next line
+    %assign push_num 8
+    LOAD_7_PARA                 ; macro defined outside this section; presumably loads arguments 1-7 into r0-r6
+    PUSH_XMM 8                  ; macro defined outside this section; presumably preserves xmm6/xmm7, which are overwritten below
+    SIGN_EXTENSION r1, r1d      ; r1 = iDstStride (assuming r0-r6 follow the argument order of the prototype above)
+    SIGN_EXTENSION r2, r2d      ; r2 = iDstWidth
+    SIGN_EXTENSION r3, r3d      ; r3 = iDstHeight
+    SIGN_EXTENSION r5, r5d      ; r5 = iSrcStride
+    SIGN_EXTENSION r6, r6d      ; r6 = uiScaleX
+
+    pxor    xmm0,   xmm0
+    mov     r12d,   r6d
+    and     r12d,   32767                    ; keep the low 15 bits of uiScaleX
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767                    ; negated increment, also reduced to 15 bits
+    movd    xmm1,   r12d                     ; uinc = uiScaleX & 32767 (i.e. uiScaleX mod 32768)
+    movd    xmm2,   r13d                     ; -uinc
+    psllq   xmm1,   32
+    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
+    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc
+
+    mov     r12,    arg8                     ; uiScaleY is the 8th argument, fetched through the arg8 macro
+    SIGN_EXTENSION r12, r12d
+    mov     rbp,    r12                      ; rbp = full uiScaleY, added to the row position r14 after every output row
+    and     r12d,   32767
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767
+    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (i.e. uiScaleY mod 32768)
+    movd    xmm2,       r13d                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc
+
+    mov     r12d,        40003fffh               ; high word 4000h = 16384, low word 3fffh = 16383
+    movd    xmm5,       r12d
+    punpcklwd   xmm5,   xmm0                    ; 16384 16383
+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383
+
+DOWNSAMPLE:
+    sub     r1, r2                   ; stride - width
+    dec     r3                       ; HEIGHT loop counter = iDstHeight - 1; the final row is written at LAST_ROW
+    mov     r14,16384                ; r14 = vertical source position; it is shifted right by 15 to obtain the row index
+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+    ;mov     r12, r4
+    mov     r12, r14
+    shr     r12,    15                 ; integer source row index
+    imul    r12,    r5
+    add     r12,    r4                 ; get current row address
+    mov     r13,    r12
+    add     r13,    r5                 ; r13 = address of the source row directly below r12
+
+    mov     r15, 16384                 ; r15 = horizontal source position; shifted right by 15 to obtain the column
+    mov     rsi, r2
+    dec     rsi                        ; WIDTH loop counter = iDstWidth - 1; the last pixel of the row is written at WIDTH_END
+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383
+
+WIDTH:
+    mov     rdi,        r15
+    shr     rdi,        15              ; rdi = integer source column
+
+    movd    xmm1,       [r12+rdi]       ; xxxxxxba
+    movd    xmm2,       [r13+rdi]       ; xxxxxxdc
+    pxor    xmm0,       xmm0            ; xmm0 is reused as a temporary further down, so it is re-zeroed every iteration
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a
+
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    movdqa  xmm0,   xmm2    ; second copy of the four 32-bit weights
+    pmuludq xmm2,   xmm1    ; even dwords: weight * pixel, 64-bit results
+    psrlq   xmm0,   32      ; move the odd dwords into the even positions
+    psrlq   xmm1,   32
+    pmuludq xmm0,   xmm1    ; odd dwords: weight * pixel, 64-bit results
+    paddq   xmm2,   xmm0
+    pshufd  xmm1,   xmm2,   00001110b   ; bring the high qword down to the low qword
+    paddq   xmm2,   xmm1    ; low qword = sum of the four weighted pixels
+    psrlq   xmm2,   29      ; shift by 29 here; the inc/shr pair below completes a rounded shift by 30
+
+    movd    ebx,    xmm2
+    inc     ebx
+    shr     ebx,    1
+    mov     [r0],   bl      ; store one output pixel
+    inc     r0
+
+    add      r15, r6        ; advance the horizontal position by uiScaleX
+    paddw   xmm3,       xmm7            ; inc u
+    psllw   xmm3,       1
+    psrlw   xmm3,       1               ; shifting left then right clears bit 15 of every word
+
+    dec     rsi
+    jg      WIDTH
+
+WIDTH_END:
+    shr     r15, 15
+    mov     bl,  [r12+r15]              ; last pixel of the row: copied from the upper source row without filtering
+    mov     [r0],bl
+    inc     r0
+    add     r14, rbp                    ; advance the vertical position by uiScaleY
+    add     r0,  r1                     ; r1 = stride - width, so r0 now points at the start of the next output row
+
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
+
+    dec     r3
+    jg      HEIGHT
+
+LAST_ROW:
+    shr     r14, 15
+    imul    r14, r5
+    add     r4, r14                     ; r4 = address of the source row used for the last output row
+    mov     r15, 16384
+
+LAST_ROW_WIDTH:
+    mov     rdi, r15
+    shr     rdi, 15
+    mov     bl,  [r4+rdi]               ; the last row is a plain byte copy, no interpolation
+    mov     [r0],bl
+    inc     r0
+
+    add     r15, r6
+    dec     r2                          ; r2 (iDstWidth) is consumed as the loop counter here
+    jg    LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+    POP_XMM                             ; counterpart of PUSH_XMM in the prologue
+    pop     rbp                         ; restore in reverse push order
+    pop     rbx
+    pop     rdi
+    pop     rsi
+    pop     r15
+    pop     r14
+    pop     r13
+    pop     r12
+    ret
+
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;               unsigned char* pSrc, const int iSrcStride,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; WIN64 build of the fast variant: bilinear resample with 16-bit horizontal and 15-bit vertical weights combined in 16-bit lanes; last column and last row are copied unfiltered.
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+    push    r12                 ; r12-r15, rsi, rdi, rbx and rbp are all written below, so they are saved first
+    push    r13
+    push    r14
+    push    r15
+    push    rsi
+    push    rdi
+    push    rbx
+    push    rbp                 ; eight pushes in total, matching push_num on the next line
+    %assign push_num 8
+    LOAD_7_PARA                 ; macro defined outside this section; presumably loads arguments 1-7 into r0-r6
+    PUSH_XMM 8                  ; macro defined outside this section; presumably preserves xmm6/xmm7, which are overwritten below
+    SIGN_EXTENSION r1, r1d      ; r1 = iDstStride (assuming r0-r6 follow the argument order of the prototype above)
+    SIGN_EXTENSION r2, r2d      ; r2 = iDstWidth
+    SIGN_EXTENSION r3, r3d      ; r3 = iDstHeight
+    SIGN_EXTENSION r5, r5d      ; r5 = iSrcStride
+    SIGN_EXTENSION r6, r6d      ; r6 = uiScaleX
+
+    pxor    xmm0,   xmm0        ; xmm0 stays zero for the whole function (only used as a source operand afterwards)
+    mov     r12d,   r6d
+    and     r12d,   65535       ; keep the low 16 bits of uiScaleX
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   65535       ; negated increment, also reduced to 16 bits
+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)
+    movd    xmm2,   r13d                     ; -uinc
+    psllq   xmm1,   32
+    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
+    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc
+
+    mov     r12,    arg8                     ; uiScaleY is the 8th argument, fetched through the arg8 macro
+    SIGN_EXTENSION r12, r12d
+    mov     rbp,    r12                      ; rbp = full uiScaleY, added to the row position r14 after every output row
+    and     r12d,   32767
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767
+    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (i.e. uiScaleY mod 32768)
+    movd    xmm2,       r13d                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc
+
+    mov     r12d,       80007fffh               ; 32768 32767
+    movd    xmm5,       r12d
+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
+
+FAST_DOWNSAMPLE:
+    sub     r1, r2                   ; stride - width
+    dec     r3                       ; FAST_HEIGHT loop counter = iDstHeight - 1; the final row is written at FAST_LAST_ROW
+    mov     r14,16384                ; r14 = vertical source position; it is shifted right by 15 to obtain the row index
+
+    pshuflw xmm4,       xmm5,   01010000b
+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+    mov     r12, r14
+    shr     r12,    15                 ; integer source row index
+    imul    r12,    r5
+    add     r12,    r4                 ; get current row address
+    mov     r13,    r12
+    add     r13,    r5                 ; r13 = address of the source row directly below r12
+
+    mov     r15, 32768                 ; r15 = horizontal source position; shifted right by 16 to obtain the column
+    mov     rsi, r2
+    dec     rsi                        ; FAST_WIDTH loop counter = iDstWidth - 1; the last pixel is written at FAST_WIDTH_END
+
+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+    mov     rdi,        r15
+    shr     rdi,        16              ; rdi = integer source column
+
+    movd    xmm1,       [r12+rdi]       ; xxxxxxba
+    movd    xmm2,       [r13+rdi]       ; xxxxxxdc
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    pmaddwd     xmm2,   xmm1            ; weights * pixels, adjacent products summed into dwords
+    pshufd  xmm1,   xmm2,   00000001b   ; bring dword 1 down to dword 0
+    paddd   xmm2,   xmm1                ; dword 0 = sum of the four weighted pixels
+    movdqa  xmm1,   [add_extra_half]    ; constant defined outside this section; presumably the rounding term for the shift below
+    paddd   xmm2,   xmm1
+    psrld   xmm2,   15
+
+    packuswb    xmm2,   xmm0            ; pack with unsigned saturation; only the lowest byte is used
+    movd    ebx,    xmm2
+    mov     [r0],  bl                   ; store one output pixel
+    inc     r0
+
+    add     r15, r6                     ; advance the horizontal position by uiScaleX
+
+    paddw   xmm3,       xmm7            ; inc u
+    dec     rsi
+    jg      FAST_WIDTH
+
+FAST_WIDTH_END:
+    shr     r15, 16
+    mov     bl,  [r12+r15]              ; last pixel of the row: copied from the upper source row without filtering
+    mov     [r0],bl
+    inc     r0
+    add     r14, rbp                    ; advance the vertical position by uiScaleY
+    add     r0,  r1                     ; r1 = stride - width, so r0 now points at the start of the next output row
+
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1                   ; shifting left then right clears bit 15 of every word
+
+    dec     r3
+    jg      FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+    shr     r14, 15
+    imul    r14, r5
+    add     r4, r14                     ; r4 = address of the source row used for the last output row
+    mov     r15, 32768
+
+FAST_LAST_ROW_WIDTH:
+    mov     rdi, r15
+    shr     rdi, 16
+    mov     bl,  [r4+rdi]               ; the last row is a plain byte copy, no interpolation
+    mov     [r0],bl
+    inc     r0
+
+    add     r15, r6
+    dec     r2                          ; r2 (iDstWidth) is consumed as the loop counter here
+    jg      FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+    POP_XMM                             ; counterpart of PUSH_XMM in the prologue
+    pop     rbp                         ; restore in reverse push order
+    pop     rbx
+    pop     rdi
+    pop     rsi
+    pop     r15
+    pop     r14
+    pop     r13
+    pop     r12
+    ret
+
+%elifdef  UNIX64
+
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;                           unsigned char* pSrc, const int iSrcStride,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; UNIX64 build: bilinear resample of pSrc into pDst using 15-bit fixed-point weights; the last column and the last row are copied unfiltered.
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+    push    r12                 ; r12-r15, rbx and rbp are all written below, so they are saved first
+    push    r13
+    push    r14
+    push    r15
+    push    rbx
+    push    rbp                 ; six pushes in total, matching push_num on the next line
+    %assign push_num 6
+    LOAD_7_PARA                 ; macro defined outside this section; presumably loads arguments 1-7 into r0-r6
+    SIGN_EXTENSION r1, r1d      ; r1 = iDstStride (assuming r0-r6 follow the argument order of the prototype above)
+    SIGN_EXTENSION r2, r2d      ; r2 = iDstWidth
+    SIGN_EXTENSION r3, r3d      ; r3 = iDstHeight
+    SIGN_EXTENSION r5, r5d      ; r5 = iSrcStride
+    SIGN_EXTENSION r6, r6d      ; r6 = uiScaleX
+
+    pxor    xmm0,   xmm0
+    mov     r12d,   r6d
+    and     r12d,   32767                    ; keep the low 15 bits of uiScaleX
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767                    ; negated increment, also reduced to 15 bits
+    movd    xmm1,   r12d                     ; uinc = uiScaleX & 32767 (i.e. uiScaleX mod 32768)
+    movd    xmm2,   r13d                     ; -uinc
+    psllq   xmm1,   32
+    por     xmm1,   xmm2                    ; 0 0  uinc  -uinc   (dword)
+    pshufd  xmm7,   xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc
+
+    mov     r12,    arg8                     ; uiScaleY is the 8th argument, fetched through the arg8 macro
+    SIGN_EXTENSION r12, r12d
+    mov     rbp,    r12                      ; rbp = full uiScaleY, added to the row position r14 after every output row
+    and     r12d,   32767
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767
+    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (i.e. uiScaleY mod 32768)
+    movd    xmm2,       r13d                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc
+
+    mov     r12d,        40003fffh               ; high word 4000h = 16384, low word 3fffh = 16383
+    movd    xmm5,       r12d
+    punpcklwd   xmm5,   xmm0                    ; 16384 16383
+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383
+
+DOWNSAMPLE:
+    sub     r1, r2                   ; stride - width
+    dec     r3                       ; HEIGHT loop counter = iDstHeight - 1; the final row is written at LAST_ROW
+    mov     r14,16384                ; r14 = vertical source position; it is shifted right by 15 to obtain the row index
+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+    ;mov     r12, r4
+    mov     r12, r14
+    shr     r12,    15                 ; integer source row index
+    imul    r12,    r5
+    add     r12,    r4                 ; get current row address
+    mov     r13,    r12
+    add     r13,    r5                 ; r13 = address of the source row directly below r12
+
+    mov     r15, 16384                 ; r15 = horizontal source position; shifted right by 15 to obtain the column
+    mov     rax, r2
+    dec     rax                        ; WIDTH loop counter = iDstWidth - 1; the last pixel of the row is written at WIDTH_END
+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383
+
+WIDTH:
+    mov     r11,        r15
+    shr     r11,        15              ; r11 = integer source column
+
+    movd    xmm1,       [r12+r11]       ; xxxxxxba
+    movd    xmm2,       [r13+r11]       ; xxxxxxdc
+    pxor    xmm0,       xmm0            ; xmm0 is reused as a temporary further down, so it is re-zeroed every iteration
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a
+
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    movdqa  xmm0,   xmm2    ; second copy of the four 32-bit weights
+    pmuludq xmm2,   xmm1    ; even dwords: weight * pixel, 64-bit results
+    psrlq   xmm0,   32      ; move the odd dwords into the even positions
+    psrlq   xmm1,   32
+    pmuludq xmm0,   xmm1    ; odd dwords: weight * pixel, 64-bit results
+    paddq   xmm2,   xmm0
+    pshufd  xmm1,   xmm2,   00001110b   ; bring the high qword down to the low qword
+    paddq   xmm2,   xmm1    ; low qword = sum of the four weighted pixels
+    psrlq   xmm2,   29      ; shift by 29 here; the inc/shr pair below completes a rounded shift by 30
+
+    movd    ebx,    xmm2
+    inc     ebx
+    shr     ebx,    1
+    mov     [r0],   bl      ; store one output pixel
+    inc     r0
+
+    add      r15, r6        ; advance the horizontal position by uiScaleX
+    paddw   xmm3,       xmm7            ; inc u
+    psllw   xmm3,       1
+    psrlw   xmm3,       1               ; shifting left then right clears bit 15 of every word
+
+    dec     rax
+    jg      WIDTH
+
+WIDTH_END:
+    shr     r15, 15
+    mov     bl,  [r12+r15]              ; last pixel of the row: copied from the upper source row without filtering
+    mov     [r0],bl
+    inc     r0
+    add     r14, rbp                    ; advance the vertical position by uiScaleY
+    add     r0,  r1                     ; r1 = stride - width, so r0 now points at the start of the next output row
+
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
+
+    dec     r3
+    jg      HEIGHT
+
+LAST_ROW:
+    shr     r14, 15
+    imul    r14, r5
+    add     r4, r14                     ; r4 = address of the source row used for the last output row
+    mov     r15, 16384
+
+LAST_ROW_WIDTH:
+    mov     r11, r15
+    shr     r11, 15
+    mov     bl,  [r4+r11]               ; the last row is a plain byte copy, no interpolation
+    mov     [r0],bl
+    inc     r0
+
+    add     r15, r6
+    dec     r2                          ; r2 (iDstWidth) is consumed as the loop counter here
+    jg    LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+    pop     rbp                         ; restore in reverse push order
+    pop     rbx
+    pop     r15
+    pop     r14
+    pop     r13
+    pop     r12
+    ret
+
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;               unsigned char* pSrc, const int iSrcStride,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; UNIX64 build of the fast variant: bilinear resample with 16-bit horizontal and 15-bit vertical weights combined in 16-bit lanes; last column and last row are copied unfiltered.
+;**************************************************************************************************************
+
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+    push    r12                 ; r12-r15, rbx and rbp are all written below, so they are saved first
+    push    r13
+    push    r14
+    push    r15
+    push    rbx
+    push    rbp                 ; six pushes in total, matching push_num on the next line
+    %assign push_num 6
+    LOAD_7_PARA                 ; macro defined outside this section; presumably loads arguments 1-7 into r0-r6
+    SIGN_EXTENSION r1, r1d      ; r1 = iDstStride (assuming r0-r6 follow the argument order of the prototype above)
+    SIGN_EXTENSION r2, r2d      ; r2 = iDstWidth
+    SIGN_EXTENSION r3, r3d      ; r3 = iDstHeight
+    SIGN_EXTENSION r5, r5d      ; r5 = iSrcStride
+    SIGN_EXTENSION r6, r6d      ; r6 = uiScaleX
+
+    pxor    xmm0,   xmm0        ; xmm0 stays zero for the whole function (only used as a source operand afterwards)
+    mov     r12d,   r6d
+    and     r12d,   65535       ; keep the low 16 bits of uiScaleX
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   65535       ; negated increment, also reduced to 16 bits
+    movd    xmm1,   r12d                     ; uinc(uiScaleX mod 65536)
+    movd    xmm2,   r13d                     ; -uinc
+    psllq   xmm1,   32
+    por     xmm1,   xmm2                    ; 0 uinc 0 -uinc
+    pshuflw xmm7,   xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc
+
+    mov     r12,    arg8                     ; uiScaleY is the 8th argument, fetched through the arg8 macro
+    SIGN_EXTENSION r12, r12d
+    mov     rbp,    r12                      ; rbp = full uiScaleY, added to the row position r14 after every output row
+    and     r12d,   32767
+    mov     r13d,   r12d
+    neg     r13d
+    and     r13d,   32767
+    movd    xmm6,       r12d                     ; vinc = uiScaleY & 32767 (i.e. uiScaleY mod 32768)
+    movd    xmm2,       r13d                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc
+
+    mov     r12d,       80007fffh               ; 32768 32767
+    movd    xmm5,       r12d
+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
+
+FAST_DOWNSAMPLE:
+    sub     r1, r2                   ; stride - width
+    dec     r3                       ; FAST_HEIGHT loop counter = iDstHeight - 1; the final row is written at FAST_LAST_ROW
+    mov     r14,16384                ; r14 = vertical source position; it is shifted right by 15 to obtain the row index
+
+    pshuflw xmm4,       xmm5,   01010000b
+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+    mov     r12, r14
+    shr     r12,    15                 ; integer source row index
+    imul    r12,    r5
+    add     r12,    r4                 ; get current row address
+    mov     r13,    r12
+    add     r13,    r5                 ; r13 = address of the source row directly below r12
+
+    mov     r15, 32768                 ; r15 = horizontal source position; shifted right by 16 to obtain the column
+    mov     rax, r2
+    dec     rax                        ; FAST_WIDTH loop counter = iDstWidth - 1; the last pixel is written at FAST_WIDTH_END
+
+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+    mov     r11,        r15
+    shr     r11,        16              ; r11 = integer source column
+
+    movd    xmm1,       [r12+r11]       ; xxxxxxba
+    movd    xmm2,       [r13+r11]       ; xxxxxxdc
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    pmaddwd     xmm2,   xmm1            ; weights * pixels, adjacent products summed into dwords
+    pshufd  xmm1,   xmm2,   00000001b   ; bring dword 1 down to dword 0
+    paddd   xmm2,   xmm1                ; dword 0 = sum of the four weighted pixels
+    movdqa  xmm1,   [add_extra_half]    ; constant defined outside this section; presumably the rounding term for the shift below
+    paddd   xmm2,   xmm1
+    psrld   xmm2,   15
+
+    packuswb    xmm2,   xmm0            ; pack with unsigned saturation; only the lowest byte is used
+    movd    ebx,    xmm2
+    mov     [r0],  bl                   ; store one output pixel
+    inc     r0
+
+    add     r15, r6                     ; advance the horizontal position by uiScaleX
+
+    paddw   xmm3,       xmm7            ; inc u
+    dec     rax
+    jg      FAST_WIDTH
+
+FAST_WIDTH_END:
+    shr     r15, 16
+    mov     bl,  [r12+r15]              ; last pixel of the row: copied from the upper source row without filtering
+    mov     [r0],bl
+    inc     r0
+    add     r14, rbp                    ; advance the vertical position by uiScaleY
+    add     r0,  r1                     ; r1 = stride - width, so r0 now points at the start of the next output row
+
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1                   ; shifting left then right clears bit 15 of every word
+
+    dec     r3
+    jg      FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+    shr     r14, 15
+    imul    r14, r5
+    add     r4, r14                     ; r4 = address of the source row used for the last output row
+    mov     r15, 32768
+
+FAST_LAST_ROW_WIDTH:
+    mov     r11, r15
+    shr     r11, 16
+    mov     bl,  [r4+r11]               ; the last row is a plain byte copy, no interpolation
+    mov     [r0],bl
+    inc     r0
+
+    add     r15, r6
+    dec     r2                          ; r2 (iDstWidth) is consumed as the loop counter here
+    jg      FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+    pop     rbp                         ; restore in reverse push order
+    pop     rbx
+    pop     r15
+    pop     r14
+    pop     r13
+    pop     r12
     ret
 %endif