shithub: openh264

Download patch

ref: 92bc88eacb6a7acc28963e60349860db0125221e
parent: 9da19758cf0d4b90f89cf8f1ba59493dfeda1c67
author: zhiliang wang <[email protected]>
date: Wed Nov 26 11:44:12 EST 2014

Add asm code for decoder cabac

--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -156,4 +156,22 @@
     vst1.32     {d22[0]},[r2],r1
     vst1.32     {d22[1]},[r2]
 WELS_ASM_FUNC_END
+
+
+/*
+ * void WelsBlockZero16x16_neon(int16_t* block, int32_t stride);
+ *
+ * Stores zeros to 16 rows of 16 int16_t each (32 bytes per row).
+ *   r0 = block  : address of the current row, advanced after every store
+ *   r1 = stride : row pitch in int16_t elements, doubled below into a byte step
+ * q0/q1 (d0-d3) hold the zero pattern that is written out.
+ */
+WELS_ASM_FUNC_BEGIN WelsBlockZero16x16_neon
+    vmov.i16    q0, #0
+    vmov.i16    q1, #0
+    lsl         r1, r1, #1                      /* elements -> bytes */
+.rept 16
+    vst1.64     {d0, d1, d2, d3}, [r0], r1      /* d0-d3 == q0,q1: one 32-byte row, then r0 += r1 */
+.endr
+WELS_ASM_FUNC_END
+
+/*
+ * void WelsBlockZero8x8_neon(int16_t* block, int32_t stride);
+ *
+ * Stores zeros to 8 rows of 8 int16_t each (16 bytes per row).
+ *   r0 = block  : address of the current row, advanced after every store
+ *   r1 = stride : row pitch in int16_t elements, doubled below into a byte step
+ * q0 (d0-d1) holds the zero pattern that is written out.
+ */
+WELS_ASM_FUNC_BEGIN WelsBlockZero8x8_neon
+    vmov.i16    q0, #0
+    lsl         r1, r1, #1                      /* elements -> bytes */
+.rept 8
+    vst1.64     {d0, d1}, [r0], r1              /* d0-d1 == q0: one 16-byte row, then r0 += r1 */
+.endr
+WELS_ASM_FUNC_END
 #endif
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -158,4 +158,21 @@
     st1     {v1.s}[0],[x2],x1
     st1     {v1.s}[1],[x2]
 WELS_ASM_AARCH64_FUNC_END
+
+/*
+ * void WelsBlockZero16x16_AArch64_neon(int16_t* block, int32_t stride);
+ *
+ * Stores zeros to 16 rows of 16 int16_t each (32 bytes per row).
+ *   x0 = block  : address of the current row, advanced after every store
+ *   w1 = stride : row pitch in int16_t elements (32-bit signed)
+ *
+ * stride arrives as a 32-bit value, so the upper half of x1 is not
+ * guaranteed to be meaningful on entry. It is sign-extended before being
+ * scaled to bytes and used as the 64-bit post-index step.
+ * v0/v1 hold the zero pattern that is written out.
+ */
+WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero16x16_AArch64_neon
+    sxtw    x1, w1                          /* widen int32_t stride to 64 bits */
+    movi    v0.16b, #0
+    movi    v1.16b, #0
+    lsl     x1, x1, #1                      /* elements -> bytes */
+.rept 16
+    st1     {v0.16b, v1.16b}, [x0], x1      /* one 32-byte row, then x0 += x1 */
+.endr
+WELS_ASM_AARCH64_FUNC_END
+
+/*
+ * void WelsBlockZero8x8_AArch64_neon(int16_t* block, int32_t stride);
+ *
+ * Stores zeros to 8 rows of 8 int16_t each (16 bytes per row).
+ *   x0 = block  : address of the current row, advanced after every store
+ *   w1 = stride : row pitch in int16_t elements (32-bit signed)
+ *
+ * stride arrives as a 32-bit value, so the upper half of x1 is not
+ * guaranteed to be meaningful on entry. It is sign-extended before being
+ * scaled to bytes and used as the 64-bit post-index step.
+ * v0 holds the zero pattern that is written out.
+ */
+WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero8x8_AArch64_neon
+    sxtw    x1, w1                          /* widen int32_t stride to 64 bits */
+    movi    v0.16b, #0
+    lsl     x1, x1, #1                      /* elements -> bytes */
+.rept 8
+    st1     {v0.16b}, [x0], x1              /* one 16-byte row, then x0 += x1 */
+.endr
+WELS_ASM_AARCH64_FUNC_END
 #endif
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -67,12 +67,21 @@
 extern "C" {
 #endif//__cplusplus
 
+#if defined(X86_ASM)
+void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t elements; implementation uses movdqa, so every row must be 16-byte aligned
+void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t elements; implementation uses movdqa, so every row must be 16-byte aligned
+#endif
+
 #if defined(HAVE_NEON)
 void SetNonZeroCount_neon (int8_t* pNonZeroCount);
+void WelsBlockZero16x16_neon(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t elements
+void WelsBlockZero8x8_neon(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t elements
 #endif
 
 #if defined(HAVE_NEON_AARCH64)
 void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
+void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t elements
+void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t elements
 #endif
 #ifdef __cplusplus
 }
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1644,15 +1644,25 @@
   //TO DO add neon and X86
 #ifdef	HAVE_NEON
   if (iCpu & WELS_CPU_NEON) {
-
+    pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_neon;
+    pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_neon;
   }
 #endif
 
 #ifdef	HAVE_NEON_AARCH64
   if (iCpu & WELS_CPU_NEON) {
+    pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_AArch64_neon;
+    pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_AArch64_neon;
+  }
+#endif
 
+#if defined(X86_ASM)
+  if (iCpu & WELS_CPU_SSE2) {
+    pFunc->pWelsBlockZero16x16Func	    = WelsBlockZero16x16_sse2;
+    pFunc->pWelsBlockZero8x8Func	    = WelsBlockZero8x8_sse2;
   }
 #endif
+
 }
 
 void SetNonZeroCount_c (int8_t* pNonZeroCount) {
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -113,3 +113,30 @@
 
     emms
     ret
+
+;***********************************************************************
+; void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride);
+;
+; Stores zeros to 16 rows of 16 int16_t each (32 bytes per row).
+;   r0 = block  : address of the current row, advanced after every row
+;   r1 = stride : row pitch in int16_t elements, doubled below into bytes
+; r0/r1 are presumably the argument-register aliases set up by
+; LOAD_2_PARA (macro defined outside this file) -- confirm there.
+; movdqa is an aligned store: every row address must be 16-byte aligned.
+; Clobbers xmm0 (zero source), r0, r1 and flags.
+;***********************************************************************
+WELS_EXTERN WelsBlockZero16x16_sse2
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    pxor    xmm0, xmm0              ; xmm0 = 0
+    add     r1, r1                  ; elements -> bytes
+%rep 16
+    movdqa  [r0     ], xmm0         ; row bytes  0..15
+    movdqa  [r0 + 16], xmm0         ; row bytes 16..31
+    add     r0, r1                  ; next row
+%endrep
+    ret
+
+;***********************************************************************
+; void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride);
+;
+; Stores zeros to 8 rows of 8 int16_t each (16 bytes per row).
+;   r0 = block  : address of the current row, advanced after every row
+;   r1 = stride : row pitch in int16_t elements, doubled below into bytes
+; r0/r1 are presumably the argument-register aliases set up by
+; LOAD_2_PARA (macro defined outside this file) -- confirm there.
+; movdqa is an aligned store: every row address must be 16-byte aligned.
+; Clobbers xmm0 (zero source), r0, r1 and flags.
+;***********************************************************************
+WELS_EXTERN WelsBlockZero8x8_sse2
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    pxor    xmm0, xmm0              ; xmm0 = 0
+    add     r1, r1                  ; elements -> bytes
+%rep 8
+    movdqa  [r0], xmm0              ; one 16-byte row
+    add     r0, r1                  ; next row
+%endrep
+    ret