shithub: openh264

--- a/codec/common/x86/asm_inc.asm

+++ b/codec/common/x86/asm_inc.asm

@@ -485,7 +485,7 @@

 %endmacro

 %macro WELS_EXTERN 1

-    ALIGN 16

+    ALIGN 16, nop

     %ifdef PREFIX

         global _%1

         %define %1 _%1

--- a/codec/processing/src/common/util.h

+++ b/codec/processing/src/common/util.h

@@ -83,10 +83,6 @@

 #define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)

 #define ALIGNBYTES         (16)       /* Worst case is requiring alignment to an 16 byte boundary */

-#define WELS_ALIGN(iInput)   ((iInput+(ALIGNMENT-1)) & ~(ALIGNMENT-1))

-#define WELS_ALIGN2(iInput)  ((iInput+1) & ~1)

-#define WELS_ALIGN4(iInput)  ((iInput+3) & ~3)

-#define WELS_ALIGN8(iInput)  ((iInput+7) & ~7)

 #define WelsCastFromPointer(p)      (reinterpret_cast<intptr_t>(p))

 #define WelsStaticCast(type, p)  (static_cast<type>(p))

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -32,6 +32,7 @@

 #include "downsample.h"

 #include "cpu.h"

+#include <cassert>

 WELSVP_NAMESPACE_BEGIN

 #define MAX_SAMPLE_WIDTH 1920

@@ -75,11 +76,10 @@

     WelsFree (m_pSampleBuffer[i][2]);

 void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int32_t iCpuFlag) {

-  sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;

-  sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;

-  sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;

-  sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;

+  sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsampler_c;

+  sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_c;

   sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;

   sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_c;

   sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsampler_c;

@@ -86,9 +86,8 @@

   sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsampler_c;

 #if defined(X86_ASM)

   if (iCpuFlag & WELS_CPU_SSE) {

-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse;

-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse;

-    sDownsampleFunc.pfHalfAverage[2]    = DyadicBilinearDownsamplerWidthx8_sse;

+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse;

+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse;

     sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;

   if (iCpuFlag & WELS_CPU_SSE2) {

@@ -96,15 +95,13 @@

     sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;

   if (iCpuFlag & WELS_CPU_SSSE3) {

-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_ssse3;

-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_ssse3;

+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_ssse3;

+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_ssse3;

     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_ssse3;

     sDownsampleFunc.pfGeneralRatioLuma    = GeneralBilinearFastDownsamplerWrap_ssse3;

   if (iCpuFlag & WELS_CPU_SSE41) {

-    sDownsampleFunc.pfHalfAverage[0]    = DyadicBilinearDownsamplerWidthx32_sse4;

-    sDownsampleFunc.pfHalfAverage[1]    = DyadicBilinearDownsamplerWidthx16_sse4;

     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;

     sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;

@@ -117,10 +114,8 @@

 #if defined(HAVE_NEON)

   if (iCpuFlag & WELS_CPU_NEON) {

-    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;

-    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;

-    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;

-    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;

+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_neon;

+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_neon;

     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_neon;

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;

@@ -130,10 +125,8 @@

 #if defined(HAVE_NEON_AARCH64)

   if (iCpuFlag & WELS_CPU_NEON) {

-    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;

-    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;

-    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;

-    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;

+    sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_AArch64_neon;

+    sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_AArch64_neon;

     sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_AArch64_neon;

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

@@ -159,14 +152,11 @@

   if (iSrcWidthY > MAX_SAMPLE_WIDTH || iSrcHeightY > MAX_SAMPLE_HEIGHT || m_bNoSampleBuffer) {

     if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {

       // use half average functions

-      uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);

-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

           (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);

-      iAlignIndex = GetAlignedIndex (iSrcWidthUV);

-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

           (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);

-      m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

+      DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

           (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);

     } else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {

@@ -223,29 +213,23 @@

     do {

       if ((iHalfSrcWidth == iDstWidthY) && (iHalfSrcHeight == iDstHeightY)) { //end

         // use half average functions

-        uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],

             (uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);

-        iAlignIndex = GetAlignedIndex (iSrcWidthUV);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],

             (uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

+        DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],

             (uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);

         break;

       } else if (((iHalfSrcWidth >> 1) >= iDstWidthY) && ((iHalfSrcHeight >> 1) >= iDstHeightY)) {

         // use half average functions

-        iDstStrideY = iHalfSrcWidth;

-        iDstStrideU = iHalfSrcWidth >> 1;

-        iDstStrideV = iHalfSrcWidth >> 1;

-        uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstY, iDstStrideY,

+        iDstStrideY = WELS_ALIGN (iHalfSrcWidth, 32);

+        iDstStrideU = WELS_ALIGN (iHalfSrcWidth >> 1, 32);

+        iDstStrideV = WELS_ALIGN (iHalfSrcWidth >> 1, 32);

+        DownsampleHalfAverage ((uint8_t*)pDstY, iDstStrideY,

             (uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);

-        iAlignIndex = GetAlignedIndex (iSrcWidthUV);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstU, iDstStrideU,

+        DownsampleHalfAverage ((uint8_t*)pDstU, iDstStrideU,

             (uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);

-        m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstV, iDstStrideV,

+        DownsampleHalfAverage ((uint8_t*)pDstV, iDstStrideV,

             (uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);

         pSrcY = (uint8_t*)pDstY;

@@ -258,9 +242,9 @@

         iSrcHeightY = iHalfSrcHeight;

         iSrcHeightUV = iHalfSrcHeight >> 1;

-        iSrcStrideY = iSrcWidthY;

-        iSrcStrideU = iSrcWidthUV;

-        iSrcStrideV = iSrcWidthUV;

+        iSrcStrideY = iDstStrideY;

+        iSrcStrideU = iDstStrideU;

+        iSrcStrideV = iDstStrideV;

         iHalfSrcWidth >>= 1;

         iHalfSrcHeight >>= 1;

@@ -286,17 +270,18 @@

   return RET_SUCCESS;

-int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {

-  int32_t iAlignIndex;

-  if ((kiSrcWidth & 0x1f) == 0)         // x32

-    iAlignIndex = 0;

-  else if ((kiSrcWidth & 0x0f) == 0)    // x16

-    iAlignIndex = 1;

-  else if ((kiSrcWidth & 0x07) == 0)    // x8

-    iAlignIndex = 2;

-  else

-    iAlignIndex = 3;

-  return iAlignIndex;

+void CDownsampling::DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride, // Picks the x32 or x16 half-average routine from the source stride alignment and calls it.

+        uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight) { // All six arguments are forwarded; only the width is adjusted before the call.

+  if ((iSrcStride & 31) == 0) { // source stride is a multiple of 32 -> x32 routine

+    assert ((iDstStride & 15) == 0); // x32 path expects a destination stride that is a multiple of 16 (debug-only check)

+    m_pfDownsample.pfHalfAverageWidthx32 (pDst, iDstStride,

+        pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 32), iSrcHeight); // width: low bit cleared, then WELS_ALIGN(.., 32) -- presumably rounds up to a multiple of 32; macro not visible here, TODO confirm

+  } else { // otherwise fall back to the x16 routine

+    assert ((iSrcStride & 15) == 0); // x16 path expects a source stride that is a multiple of 16

+    assert ((iDstStride &  7) == 0); // ... and a destination stride that is a multiple of 8

+    m_pfDownsample.pfHalfAverageWidthx16 (pDst, iDstStride,

+        pSrc, iSrcStride, WELS_ALIGN (iSrcWidth & ~1, 16), iSrcHeight); // same width adjustment with 16 -- NOTE(review): if this rounds up, pixels beyond iSrcWidth inside the stride get processed; confirm buffers allow it

+  }

--- a/codec/processing/src/downsample/downsample.h

+++ b/codec/processing/src/downsample/downsample.h

@@ -73,8 +73,8 @@

 SpecificDownsampleFunc  DyadicBilinearQuarterDownsampler_c;

 typedef struct {

-  // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;

-  PHalveDownsampleFunc          pfHalfAverage[4];

+  PHalveDownsampleFunc          pfHalfAverageWidthx32;

+  PHalveDownsampleFunc          pfHalfAverageWidthx16;

   PSpecificDownsampleFunc       pfOneThirdDownsampler;

   PSpecificDownsampleFunc       pfQuarterDownsampler;

   PGeneralDownsampleFunc        pfGeneralRatioLuma;

@@ -94,10 +94,6 @@

 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_ssse3;

 // iSrcWidth= x32 pixels

 HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_ssse3;

-// iSrcWidth= x16 pixels

-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx16_sse4;

-// iSrcWidth= x32 pixels

-HalveDownsampleFunc     DyadicBilinearDownsamplerWidthx32_sse4;

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;

@@ -185,7 +181,8 @@

  private:

   void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);

-  int32_t GetAlignedIndex (const int32_t kiSrcWidth);

+  void DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,

+      uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight);

   bool AllocateSampleBuffer();

   void FreeSampleBuffer();

  private:

--- a/codec/processing/src/x86/downsample_bilinear.asm

+++ b/codec/processing/src/x86/downsample_bilinear.asm

@@ -40,6 +40,10 @@

 ;*************************************************************************/

 %include "asm_inc.asm"

+%ifdef __NASM_VER__

+    %use smartalign

+%endif

 ;***********************************************************************

 ; Macros and other preprocessor constants

 ;***********************************************************************

@@ -471,7 +475,6 @@

-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse

 ;***********************************************************************

 ;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,

 ;                   unsigned char* pSrc, const int iSrcStride,

@@ -478,17 +481,6 @@

 ;                   const int iSrcWidth, const int iSrcHeight );

 ;***********************************************************************

 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3

-    ;push ebx

-    ;push edx

-    ;push esi

-    ;push edi

-    ;push ebp

-    ;mov edi, [esp+24]   ; pDst

-    ;mov edx, [esp+28]   ; iDstStride

-    ;mov esi, [esp+32]   ; pSrc

-    ;mov ecx, [esp+36]   ; iSrcStride

-    ;mov ebp, [esp+44]   ; iSrcHeight

 %ifdef X86_32

     push r6

     %assign push_num 1

@@ -496,7 +488,7 @@

     %assign push_num 0

 %endif

     LOAD_6_PARA

-    PUSH_XMM 8

+    PUSH_XMM 4

     SIGN_EXTENSION r1, r1d

     SIGN_EXTENSION r3, r3d

     SIGN_EXTENSION r4, r4d

@@ -508,15 +500,12 @@

 %endif

     sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

+    WELS_DB1 xmm3

+    WELS_Zero xmm2

+    sar r4, $01            ; iSrcWidth >> 1

+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops4:

-    ;mov eax, [esp+40]   ; iSrcWidth

-    ;sar eax, $01            ; iSrcWidth >> 1

-    ;mov ebx, eax        ; iDstWidth restored at ebx

-    ;sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    ;neg ebx             ; - (iSrcWidth >> 1)

 %ifdef X86_32

     mov r4, arg5

 %else

@@ -523,81 +512,32 @@

     mov r4, r12

 %endif

     sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

+    neg r4                 ; -(iSrcWidth >> 1)

+    mov r6, r4

+    align 16

     ; each loop = source bandwidth: 32 bytes

 .xloops4:

-    ; 1st part horizonal loop: x16 bytes

-    ;               mem  hi<-       ->lo

-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

-    ;               xmm1: p P o O n N m M l L k K j J i I

-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

-    ;               xmm3: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: P O N M L K J I H G F E D C B A

-    ;: p o n m l k j i h g f e d c b a

-    ;: P ..                          A

-    ;: p ..                          a

+    movdqa xmm0, [r2+r3]

+    movdqa xmm1, [r2+r3+16]

+    pavgb  xmm0, [r2]          ; avg vertical pixels 0-15

+    pavgb  xmm1, [r2+16]       ; avg vertical pixels 16-31

+    add r2, 32                 ; pSrc += 32

+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels 0-15

+    pmaddubsw xmm1, xmm3       ; pairwise horizontal sum neighboring pixels 16-31

+    pavgw xmm0, xmm2           ; (sum + 1) >> 1

+    pavgw xmm1, xmm2           ; (sum + 1) >> 1

+    packuswb xmm0, xmm1        ; pack words to bytes

+    movdqa [r0+r4], xmm0       ; store results

+    add r4, 16

+    jl .xloops4

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [r2]          ; 1st_src_line

-    movdqa xmm1, [r2+16]       ; 1st_src_line + 16

-    movdqa xmm2, [r2+r3]      ; 2nd_src_line

-    movdqa xmm3, [r2+r3+16]   ; 2nd_src_line + 16

-    ; packing & avg

-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    ; another implementation for xmm4 high bits

-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm4

-    movdqa xmm5, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm1

-;   psrlw xmm5, 8

-    pavgb xmm1, xmm5

-    movdqa xmm4, xmm2

-    pshufb xmm2, xmm7

-    pshufb xmm4, xmm6

-;   psubb xmm4, xmm2

-;   psrlw xmm4, 8

-    pavgb xmm2, xmm4

-    movdqa xmm5, xmm3

-    pshufb xmm3, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm3

-;   psrlw xmm5, 8

-    pavgb xmm3, xmm5

-    packuswb xmm0, xmm1

-    packuswb xmm2, xmm3

-    pavgb xmm0, xmm2

-    ; write pDst

-    movdqa [r0], xmm0

-    ; next SMB

-    lea r2, [r2+32]

-    lea r0, [r0+16]

-    dec r4

-    jg near .xloops4

     ; next line

     lea r2, [r2+2*r3]    ; next end of lines

     lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

     lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops4

+    sub r5, 1

+    jg .yloops4

 %ifndef X86_32

     pop r12

@@ -623,7 +563,7 @@

     %assign push_num 0

 %endif

     LOAD_6_PARA

-    PUSH_XMM 6

+    PUSH_XMM 4

     SIGN_EXTENSION r1, r1d

     SIGN_EXTENSION r3, r3d

     SIGN_EXTENSION r4, r4d

@@ -634,8 +574,11 @@

     mov r12, r4

 %endif

     sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm5, [shufb_mask_low]   ; mask low

-    movdqa xmm4, [shufb_mask_high]  ; mask high

+    WELS_DB1 xmm3

+    WELS_Zero xmm2

+    add r2, r4             ; pSrc += iSrcWidth

+    sar r4, $01            ; iSrcWidth >> 1

+    add r0, r4             ; pDst += iSrcWidth >> 1

 .yloops5:

 %ifdef X86_32

@@ -644,279 +587,26 @@

     mov r4, r12

 %endif

     sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

+    neg r4                 ; -(iSrcWidth >> 1)

+    lea r6, [r2+r3]        ; pSrc + iSrcStride

+    align 16

     ; each loop = source bandwidth: 16 bytes

 .xloops5:

-    ; horizonal loop: x16 bytes by source

-    ;               mem  hi<-       ->lo

-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: H G F E D C B A, P O N M L K J I

-    ;: h g f e d c b a, p o n m l k j i

+    movdqa xmm0, [r2+2*r4]

+    pavgb  xmm0, [r6+2*r4]     ; avg vertical pixels

+    pmaddubsw xmm0, xmm3       ; pairwise horizontal sum neighboring pixels

+    pavgw xmm0, xmm2           ; (sum + 1) >> 1

+    packuswb xmm0, xmm0        ; pack words to bytes

+    movlps [r0+r4], xmm0       ; store results

+    add r4, 8

+    jl .xloops5

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movdqa xmm0, [r2]          ; 1st_src_line

-    movdqa xmm1, [r2+r3]      ; 2nd_src_line

-    ; packing & avg

-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    ; another implementation for xmm2 high bits

-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm2

-    movdqa xmm3, xmm1

-    pshufb xmm1, xmm5

-    pshufb xmm3, xmm4

-;   psubb xmm3, xmm1

-;   psrlw xmm3, 8

-    pavgb xmm1, xmm3

-    pavgb xmm0, xmm1

-    packuswb xmm0, xmm1

-    ; write pDst

-    movq [r0], xmm0

-    ; next SMB

-    lea r2, [r2+16]

-    lea r0, [r0+8]

-    dec r4

-    jg near .xloops5

-    lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

-    lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops5

-%ifndef X86_32

-    pop r12

-%endif

-    POP_XMM

-    LOAD_6_PARA_POP

-%ifdef X86_32

-    pop r6

-%endif

-    ret

-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse

-;***********************************************************************

-;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,

-;                   unsigned char* pSrc, const int iSrcStride,

-;                   const int iSrcWidth, const int iSrcHeight );

-;***********************************************************************

-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4

-%ifdef X86_32

-    push r6

-    %assign push_num 1

-%else

-    %assign push_num 0

-%endif

-    LOAD_6_PARA

-    PUSH_XMM 8

-    SIGN_EXTENSION r1, r1d

-    SIGN_EXTENSION r3, r3d

-    SIGN_EXTENSION r4, r4d

-    SIGN_EXTENSION r5, r5d

-%ifndef X86_32

-    push r12

-    mov r12, r4

-%endif

-    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm7, [shufb_mask_low]   ; mask low

-    movdqa xmm6, [shufb_mask_high]  ; mask high

-.yloops6:

-%ifdef X86_32

-    mov r4, arg5

-%else

-    mov r4, r12

-%endif

-    sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

-    ; each loop = source bandwidth: 32 bytes

-.xloops6:

-    ; 1st part horizonal loop: x16 bytes

-    ;               mem  hi<-       ->lo

-    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A

-    ;               xmm1: p P o O n N m M l L k K j J i I

-    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A

-    ;               xmm3: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: P O N M L K J I H G F E D C B A

-    ;: p o n m l k j i h g f e d c b a

-    ;: P ..                          A

-    ;: p ..                          a

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [r2]            ; 1st_src_line

-    movntdqa xmm1, [r2+16]     ; 1st_src_line + 16

-    movntdqa xmm2, [r2+r3]        ; 2nd_src_line

-    movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16

-    ; packing & avg

-    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm4

-    movdqa xmm5, xmm1

-    pshufb xmm1, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm1

-;   psrlw xmm5, 8

-    pavgb xmm1, xmm5

-    movdqa xmm4, xmm2

-    pshufb xmm2, xmm7

-    pshufb xmm4, xmm6

-;   psubb xmm4, xmm2

-;   psrlw xmm4, 8

-    pavgb xmm2, xmm4

-    movdqa xmm5, xmm3

-    pshufb xmm3, xmm7

-    pshufb xmm5, xmm6

-;   psubb xmm5, xmm3

-;   psrlw xmm5, 8

-    pavgb xmm3, xmm5

-    packuswb xmm0, xmm1

-    packuswb xmm2, xmm3

-    pavgb xmm0, xmm2

-    ; write pDst

-    movdqa [r0], xmm0

-    ; next SMB

-    lea r2, [r2+32]

-    lea r0, [r0+16]

-    dec r4

-    jg near .xloops6

-    lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

-    lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops6

-%ifndef X86_32

-    pop r12

-%endif

-    POP_XMM

-    LOAD_6_PARA_POP

-%ifdef X86_32

-    pop r6

-%endif

-    ret

-;***********************************************************************

-;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,

-;                     unsigned char* pSrc, const int iSrcStride,

-;                     const int iSrcWidth, const int iSrcHeight );

-;***********************************************************************

-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4

-%ifdef X86_32

-    push r6

-    %assign push_num 1

-%else

-    %assign push_num 0

-%endif

-    LOAD_6_PARA

-    PUSH_XMM 6

-    SIGN_EXTENSION r1, r1d

-    SIGN_EXTENSION r3, r3d

-    SIGN_EXTENSION r4, r4d

-    SIGN_EXTENSION r5, r5d

-%ifndef X86_32

-    push r12

-    mov r12, r4

-%endif

-    sar r5, $01            ; iSrcHeight >> 1

-    movdqa xmm5, [shufb_mask_low]   ; mask low

-    movdqa xmm4, [shufb_mask_high]  ; mask high

-.yloops7:

-%ifdef X86_32

-    mov r4, arg5

-%else

-    mov r4, r12

-%endif

-    sar r4, $01            ; iSrcWidth >> 1

-    mov r6, r4        ; iDstWidth restored at ebx

-    sar r4, $03            ; (iSrcWidth >> 1) / 8     ; loop count = num_of_mb

-    neg r6             ; - (iSrcWidth >> 1)

-    ; each loop = source bandwidth: 16 bytes

-.xloops7:

-    ; horizonal loop: x16 bytes by source

-    ;               mem  hi<-       ->lo

-    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A

-    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I

-    ;=> target:

-    ;: H G F E D C B A, P O N M L K J I

-    ;: h g f e d c b a, p o n m l k j i

-    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-    movntdqa xmm0, [r2]            ; 1st_src_line

-    movntdqa xmm1, [r2+r3]        ; 2nd_src_line

-    ; packing & avg

-    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A

-    pshufb xmm0, xmm5           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A

-    pshufb xmm2, xmm4           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0

-;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a

-    pavgb xmm0, xmm2

-    movdqa xmm3, xmm1

-    pshufb xmm1, xmm5

-    pshufb xmm3, xmm4

-;   psubb xmm3, xmm1

-;   psrlw xmm3, 8

-    pavgb xmm1, xmm3

-    pavgb xmm0, xmm1

-    packuswb xmm0, xmm1

-    ; write pDst

-    movq [r0], xmm0

-    ; next SMB

-    lea r2, [r2+16]

-    lea r0, [r0+8]

-    dec r4

-    jg near .xloops7

     ; next line

     lea r2, [r2+2*r3]    ; next end of lines

-    lea r2, [r2+2*r6]    ; reset to base 0 [- 2 * iDstWidth]

     lea r0, [r0+r1]

-    lea r0, [r0+r6]      ; reset to base 0 [- iDstWidth]

-    dec r5

-    jg near .yloops7

+    sub r5, 1

+    jg .yloops5

 %ifndef X86_32

     pop r12

--- a/test/api/decode_api_test.cpp

+++ b/test/api/decode_api_test.cpp

@@ -759,9 +759,17 @@

 const uint32_t kiFrameRate = 12; //DO NOT CHANGE!

 const uint32_t kiFrameNum = 100; //DO NOT CHANGE!

 const char* pHashStr[] = { //DO NOT CHANGE!

+// X86_ASM downsampling routines average vertically first, as opposed to

+// horizontally first, which results in different output.

+#ifdef X86_ASM

+  "244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",

+  "bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",

+  "809f97e836650624d92f0b8e200a6ab25f810d6f"

+#else

   "9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",

   "f350001c333902029800bd291fbed915a4bdf19a",

   "eb9d853b7daec03052c4850027ac94adc84c3a7e"

+#endif

};

 class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {

--- a/test/api/encoder_test.cpp

+++ b/test/api/encoder_test.cpp

@@ -123,7 +123,14 @@

},

     "res/CiscoVT2people_320x192_12fps.yuv",

-    "73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false

+// X86_ASM downsampling routines average vertically first, as opposed to

+// horizontally first, which results in different output.

+#ifdef X86_ASM

+    "a5341d588b769809c1f1d983e5a0fcef7362f3ad",

+#else

+    "73156dfc1dc45924349b5b79f8debcac13d7231d",

+#endif

+    CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false

},

     "res/Cisco_Absolute_Power_1280x720_30fps.yuv",

@@ -131,7 +138,14 @@

},

     "res/Cisco_Absolute_Power_1280x720_30fps.yuv",

-    "3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

+// X86_ASM downsampling routines average vertically first, as opposed to

+// horizontally first, which results in different output.

+#ifdef X86_ASM

+    "ec9d776a7d92cf0f6640065aee8af2450af0e993",

+#else

+    "3943145545a2bd27a642b2045d4e3dbae55c6870",

+#endif

+    CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false

},

   // the following values may be adjusted for times since we start tuning the strategy

--- a/test/processing/ProcessUT_DownSample.cpp

+++ b/test/processing/ProcessUT_DownSample.cpp

@@ -30,6 +30,27 @@

+void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride, // Reference 2:1 downsampler: each output pixel comes from a 2x2 source block, averaged vertically first, then horizontally.

+                                     const uint8_t* pSrc, const int32_t kiSrcStride, // pSrc is only read; both strides are applied as byte offsets between rows.

+                                     const int32_t kiSrcWidth, const int32_t kiSrcHeight) { // Writes (kiSrcWidth >> 1) x (kiSrcHeight >> 1) pixels to pDst.

+  uint8_t* pDstLine = pDst; // current output row

+  const uint8_t* pSrcLine1 = pSrc; // upper source row of the current row pair

+  const uint8_t* pSrcLine2 = pSrc + kiSrcStride; // lower source row of the current row pair

+  const int32_t kiDstWidth  = kiSrcWidth >> 1; // an odd trailing source column is ignored

+  const int32_t kiDstHeight = kiSrcHeight >> 1; // an odd trailing source row is ignored

+  for (int32_t j = 0; j < kiDstHeight; j++) {

+    for (int32_t i = 0; i < kiDstWidth; i++) {

+      const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1; // vertical average of the left column, rounding half up

+      const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1; // vertical average of the right column, rounding half up

+      pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1); // horizontal average of the two column averages (second rounding step)

+    }

+    pDstLine += kiDstStride; // advance one output row

+    pSrcLine1 += 2 * kiSrcStride; // advance two source rows

+    pSrcLine2 += 2 * kiSrcStride;

+  }

+}

 void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,

     const int32_t kiDstHeight,

     uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

@@ -162,7 +183,7 @@

-#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \

+#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \

 TEST (DownSampleTest, func) { \

   if (ASM) {\

     int32_t iCpuCores = 0; \

@@ -190,7 +211,7 @@

     dst_c[j] = dst_a[j] = rand() % 256; \

     src_c[j] = src_a[j] = rand() % 256; \

} \

-  DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \

+  ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \

   func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \

   for (int j = 0; j < (src_height_c >> 1); j++) { \

     for (int m = 0; m < (src_width_c >> 1); m++) { \

@@ -199,6 +220,11 @@

} \

+#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \

+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref)

+#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \

+  GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref)

 #define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \

 TEST (DownSampleTest, func) { \

   if (ASM) {\

@@ -328,11 +354,8 @@

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)

 GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)

-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)

+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)

+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)

 GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)