shithub: openh264

--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h

+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h

@@ -80,6 +80,8 @@

 #ifdef  X86_ASM

 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,

                             int32_t iEndIdx);

+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,

+                             int32_t iEndIdx);  // same parameter list and return type as CavlcParamCal_sse2 above

 #endif

 #if defined(__cplusplus)

--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp

+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp

@@ -279,6 +279,11 @@

     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;

 #endif

+#ifdef X86_ASM

+  if (uiCpuFlag & WELS_CPU_SSE42) {  // runs after the earlier assignment, so it replaces that choice when SSE4.2 is reported

+    pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;

+  }

+#endif  // X86_ASM

   if (iEntropyCodingModeFlag) {

     pFuncList->pfStashMBStatus = StashMBStatusCabac;

     pFuncList->pfStashPopMBStatus = StashPopMBStatusCabac;

--- a/codec/encoder/core/x86/coeff.asm

+++ b/codec/encoder/core/x86/coeff.asm

@@ -42,10 +42,57 @@

 %include "asm_inc.asm"

+SECTION .rodata align=16

+align 16

+wels_shufb_rev:                                            ; pshufb control that reverses the 16 bytes of a vector

+    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+; Zero-run table with one 4-byte entry per 4-bit mask value (set bit = non-zero word).

+; Byte 0 counts the zero bits below the lowest set bit; each following byte counts the

+; zero bits directly above the next set bit. With all 4 bits set this requires 5 zeros.

+; The 5th zero can either be read from beyond the final table entry or be

+; implied via zero-initializing the location being read into.

+wels_cavlc_param_cal_run_lut:

+    db 4, 0, 0, 0                                          ; mask 0000b

+    db 0, 3, 0, 0                                          ; mask 0001b

+    db 1, 2, 0, 0                                          ; mask 0010b

+    db 0, 0, 2, 0                                          ; mask 0011b

+    db 2, 1, 0, 0                                          ; mask 0100b

+    db 0, 1, 1, 0                                          ; mask 0101b

+    db 1, 0, 1, 0                                          ; mask 0110b

+    db 0, 0, 0, 1                                          ; mask 0111b

+    db 3, 0, 0, 0                                          ; mask 1000b

+    db 0, 2, 0, 0                                          ; mask 1001b

+    db 1, 1, 0, 0                                          ; mask 1010b

+    db 0, 0, 1, 0                                          ; mask 1011b

+    db 2, 0, 0, 0                                          ; mask 1100b

+    db 0, 1, 0, 0                                          ; mask 1101b

+    db 1, 0, 0, 0                                          ; mask 1110b

+    db 0, 0, 0, 0                                          ; mask 1111b

+;   db 0                                                   ; the optional 5th zero for mask 1111b, left commented out

+; Table of 8-byte pshufb controls, one per 4-bit mask value, for compacting a 4-word vector: words whose mask

+; bit is zero are dropped and the rest are concatenated in reverse order (bit 0 selects word 3, bit 3 selects word 0).

+wels_cavlc_param_cal_shufb_lut:

+    db 0, 0, 0, 0, 0, 0, 0, 0                              ; mask 0000b (unused slots select byte 0)

+    db 6, 7, 0, 0, 0, 0, 0, 0                              ; mask 0001b

+    db 4, 5, 0, 0, 0, 0, 0, 0                              ; mask 0010b

+    db 6, 7, 4, 5, 0, 0, 0, 0                              ; mask 0011b

+    db 2, 3, 0, 0, 0, 0, 0, 0                              ; mask 0100b

+    db 6, 7, 2, 3, 0, 0, 0, 0                              ; mask 0101b

+    db 4, 5, 2, 3, 0, 0, 0, 0                              ; mask 0110b

+    db 6, 7, 4, 5, 2, 3, 0, 0                              ; mask 0111b

+    db 0, 1, 0, 0, 0, 0, 0, 0                              ; mask 1000b

+    db 6, 7, 0, 1, 0, 0, 0, 0                              ; mask 1001b

+    db 4, 5, 0, 1, 0, 0, 0, 0                              ; mask 1010b

+    db 6, 7, 4, 5, 0, 1, 0, 0                              ; mask 1011b

+    db 2, 3, 0, 1, 0, 0, 0, 0                              ; mask 1100b

+    db 6, 7, 2, 3, 0, 1, 0, 0                              ; mask 1101b

+    db 4, 5, 2, 3, 0, 1, 0, 0                              ; mask 1110b

+    db 6, 7, 4, 5, 2, 3, 0, 1                              ; mask 1111b

 %ifdef X86_32

-SECTION .rodata align=16

 align 16

 sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8

@@ -312,6 +359,8 @@

     db 7,6,5,4,3,2,1,7, ;254

     db 7,6,5,4,3,2,1,8, ;255

+%endif ; X86_32

 ;***********************************************************************

 ; Code

 ;***********************************************************************

@@ -318,6 +367,7 @@

 SECTION .text

+%ifdef X86_32

 ;***********************************************************************

 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);

@@ -457,3 +507,162 @@

     pop ebx

ret

 %endif

+;***********************************************************************

+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); Stores the non-zero words of coffLevel to Level (highest index first), their zero runs to run and their count to *total_coeffs; returns the number of zero words below the highest non-zero word (0 if all are zero).

+;***********************************************************************

+WELS_EXTERN CavlcParamCal_sse42

+%define p_coeff_level  r0                                  ; coffLevel: coefficient words to scan

+%define p_run          r1                                  ; run: output, one zero-run byte per stored level

+%define p_level        r2                                  ; Level: output, non-zero words, highest index first

+%define p_total_coeffs r3                                  ; total_coeffs: output, number of non-zero words

+%define i_endidxd      r4d                                 ; endIdx: only compared against 3 to pick the load width

+%ifdef X86_32

+    push            r5                                      ; r5 becomes r_mask; restored before ret

+    push            r6                                      ; r6 becomes the spare pointer register; restored before ret

+    %assign push_num 2                                      ; two registers pushed so far

+    %define r_mask  r5

+    %define r_maskd r5d

+    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut

+    %define p_run_lut   wels_cavlc_param_cal_run_lut

+%elifdef WIN64

+    push            rbx                                     ; rbx becomes r_mask; restored before ret

+    %assign push_num 1                                      ; one register pushed so far

+    %define r_mask  rbx

+    %define r_maskd ebx

+    %define p_shufb_lut r5

+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))

+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut] ; table base in a register; the run table is addressed relative to it

+%else

+    %assign push_num 0                                      ; nothing pushed on this path

+    %define r_mask  rax

+    %define r_maskd eax

+    %define p_shufb_lut r5

+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))

+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut] ; table base in a register; the run table is addressed relative to it

+%endif

+    LOAD_5_PARA

+    PUSH_XMM 2

+    ; Free up rcx/ecx because only cl is accepted as shift amount operand.

+%ifidni r0b, cl

+    mov             r6, r0                                  ; r0 is the cl register here: keep the pointer in r6, reuse r0 as scratch

+    %undef p_coeff_level

+    %define p_coeff_level r6

+    %define r_tmp r0

+    %define r_tmpd r0d

+    %define r_tmpb r0b

+%elifidni r1b, cl

+    mov             r6, r1                                  ; r1 is the cl register here: keep the pointer in r6, reuse r1 as scratch

+    %undef p_run

+    %define p_run r6

+    %define r_tmp r1

+    %define r_tmpd r1d

+    %define r_tmpb r1b

+%elifidni r3b, cl

+    mov             r6, r3                                  ; r3 is the cl register here: keep the pointer in r6, reuse r3 as scratch

+    %undef p_total_coeffs

+    %define p_total_coeffs r6

+    %define r_tmp r3

+    %define r_tmpd r3d

+    %define r_tmpb r3b

+%else

+    %error "Unknown cl register."

+%endif

+    ; Acquire a bitmask indicating which words are non-zero.

+    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.

+    ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.

+    ; Assumptions are taken from previous implementations.

+    pxor            xmm1, xmm1                              ; xmm1 = 0, used as pack/compare operand and to clear the run buffer

+    cmp             i_endidxd, 3

+    jg              .load16                                 ; endIdx > 3: read all 16 words

+    movq            xmm0, [p_coeff_level]                   ; endIdx <= 3: read only 4 words (8 bytes)

+    packsswb        xmm0, xmm1                              ; saturating word->byte pack keeps non-zero words non-zero

+    jmp             .load_done

+.load16:

+    movdqa          xmm0, [p_coeff_level]                   ; words 0-7 (aligned load)

+    packsswb        xmm0, [p_coeff_level + 16]              ; bytes 0-7 from words 0-7, bytes 8-15 from words 8-15

+.load_done:

+    movdqa          [p_run], xmm1                           ; Zero-initialize because we may read back implied zeros.

+    pcmpeqb         xmm0, xmm1                              ; 0FFh in every byte whose coefficient is zero

+    pshufb          xmm0, [wels_shufb_rev]                  ; reverse byte order: byte 0 now belongs to coefficient 15

+    pmovmskb        r_maskd, xmm0                           ; one bit per byte, set where the coefficient is zero

+    xor             r_maskd, 0FFFFh                         ; invert: set bits now mark non-zero coefficients

+    mov             r_tmpd, i_endidxd                       ; NOTE(review): r_tmpd is rewritten by bsf below before any read; this copy looks unused - confirm

+%undef i_endidxd

+%define r_tmp2  r4

+%define r_tmp2d r4d

+    popcnt          r_tmp2d, r_maskd                        ; number of non-zero coefficients; ZF is set when the mask is empty

+    mov             [p_total_coeffs], r_tmp2d               ; *total_coeffs = count

+%xdefine i_total_zeros p_total_coeffs

+%undef p_total_coeffs

+    mov             i_total_zeros, r_tmp2                   ; return value for the all-zero case (count is 0 there); mov leaves ZF from popcnt intact

+    jz              .done                                   ; no non-zero coefficient: nothing more to store

+    mov             i_total_zeros, 16

+    sub             i_total_zeros, r_tmp2                   ; 16 - count = zeros among all 16 positions

+    bsf             r_tmpd, r_maskd                         ; Find first set bit.

+    sub             i_total_zeros, r_tmp                    ; exclude the zeros above the highest non-zero coefficient

+    ; Skip trailing zeros.

+    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.

+    and             r_tmpd, -4                              ; round the skip count down to whole quadruples

+    shr             r_maskd, r_tmpb                         ; r_tmpb is cl (arranged above)

+    add             r_tmpd, r_tmpd                          ; words -> bytes

+    sub             p_coeff_level, r_tmp                    ; rebase so [p_coeff_level + 24] is the first quadruple holding a non-zero word

+    ; Handle first quadruple containing a non-zero value.

+    mov             r_tmp, r_mask

+    and             r_tmpd, 0Fh                             ; low nibble = non-zero pattern of this quadruple

+    movq            xmm0, [p_coeff_level + 24]

+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]         ; 8-byte shuffle control for this pattern

+    pshufb          xmm0, xmm1                              ; compact the non-zero words to the front, highest index first

+    mov             r_tmp2d, [p_run_lut + 4 * r_tmp]        ; four run bytes for this pattern

+    shr             r_tmp2d, 8                              ; Skip initial zero run.

+    movlps          [p_level], xmm0                         ; Store levels for the first quadruple.

+    mov             [p_run], r_tmp2d                        ; Store accompanying zero runs thus far.

+    shr             r_maskd, 4                              ; next quadruple's pattern; ZF is set when no non-zero coefficients remain

+    jz              .done

+.loop:

+    ; Increment pointers.

+    popcnt          r_tmpd, r_tmpd                          ; Number of non-zero values handled.

+    lea             p_level, [p_level + 2 * r_tmp]          ; one word per stored level

+    add             p_run, r_tmp                            ; one byte per stored level

+    ; Handle next quadruple.

+    mov             r_tmp, r_mask

+    and             r_tmpd, 0Fh                             ; low nibble = non-zero pattern of this quadruple

+    movq            xmm0, [p_coeff_level + 16]              ; the four words just below the previous quadruple

+    sub             p_coeff_level, 8                        ; step down one quadruple for the next iteration

+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]         ; 8-byte shuffle control for this pattern

+    pshufb          xmm0, xmm1                              ; compact the non-zero words to the front, highest index first

+    movzx           r_tmp2d, byte [p_run - 1]               ; run stored so far for the most recent level

+    add             r_tmp2d, [p_run_lut + 4 * r_tmp]        ; Add to previous run and get eventual new runs.

+    movlps          [p_level], xmm0                         ; Store levels (potentially none).

+    mov             [p_run - 1], r_tmp2d                    ; Update previous run and store eventual new runs.

+    shr             r_maskd, 4                              ; next quadruple's pattern

+    jnz             .loop                                   ; continue while non-zero coefficients remain

+.done:

+    mov             retrq, i_total_zeros                    ; return the zero count

+    POP_XMM

+    LOAD_5_PARA_POP

+%ifdef X86_32

+    pop             r6                                      ; undo the pushes from the prologue, in reverse order

+    pop             r5

+%elifdef WIN64

+    pop             rbx                                     ; undo the push from the prologue

+%endif

+    ret

+%undef p_coeff_level

+%undef p_run

+%undef p_level

+%undef i_total_zeros

+%undef r_mask

+%undef r_maskd

+%undef r_tmp

+%undef r_tmpd

+%undef r_tmpb

+%undef r_tmp2

+%undef r_tmp2d

+%undef p_shufb_lut

+%undef p_run_lut

--- a/test/encoder/EncUT_Cavlc.cpp

+++ b/test/encoder/EncUT_Cavlc.cpp

@@ -81,3 +81,10 @@

   TestCavlcParamCal (CavlcParamCal_sse2);

 #endif

+#ifdef X86_ASM

+TEST (CavlcTest, CavlcParamCal_sse42) {

+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)  // only exercise the routine when the CPU reports SSE4.2

+    TestCavlcParamCal (CavlcParamCal_sse42);

+}

+#endif  // X86_ASM