shithub: openh264

--- a/codec/encoder/core/arm/svc_motion_estimation.S

+++ b/codec/encoder/core/arm/svc_motion_estimation.S

@@ -235,4 +235,133 @@

 _SumOf16x16BlockOfFrame_neon_end:

     ldmia sp!, {r4-r12}

 WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
+// void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+//                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+//
+// For each list entry i, stores the byte address
+//     pBuf + 4 * (pTimesOfFeatureValue[0] + ... + pTimesOfFeatureValue[i-1])
+// to both pLocationOfFeature[i] and pFeatureValuePointerList[i].
+//
+// In:   r0 = pTimesOfFeatureValue, r1 = pBuf, r2 = kiListSize, r3 = pLocationOfFeature,
+//       first stack word = pFeatureValuePointerList
+// Uses: r4-r7 (saved and restored here), q0-q3 and q8. q8 is used as scratch instead of q4
+//       because d8-d15 (q4-q7) are callee-saved under the AAPCS and this function does not spill them.
+    stmdb sp!, {r4-r7}
+    ldr r4, [sp, #16] //pFeatureValuePointerList: first stack argument, now above the 16 saved bytes
+    bics r5, r2, #3 // r5 = kiListSize rounded down to a multiple of 4
+    beq _hash_assign_x4_done // no full group of 4: entering the loop with r5 == 0 would wrap the counter
+_hash_assign_loop_x4:
+    vld1.64 {q0}, [r0]! // four counts
+    vshl.u32 q0, q0, #2 // counts -> sizes in bytes (4 bytes per count), s0..s3
+    vceq.u32 q1, q0, #0
+    vand.i32 d2, d2, d3
+    vmov r6, r7, d2
+    and r6, r6, r7
+    cmp r6, #0xffffffff // all four lanes equal to zero?
+    beq _hash_assign_with_copy_x4
+    veor q1, q1 // q1 = 0, the value shifted in by the vext instructions below
+    vext.32 q2, q1, q0, #3 // {0, s0, s1, s2}
+    vext.32 q3, q1, q0, #2 // {0, 0, s0, s1}
+    vext.32 q8, q1, q0, #1 // {0, 0, 0, s0}
+    vadd.u32 q0, q0, q2
+    vadd.u32 q0, q0, q3
+    vadd.u32 q0, q0, q8 // q0 = inclusive prefix sums of s0..s3
+    vext.32 q2, q1, q0, #3 // exclusive prefix sums {0, s0, s0+s1, s0+s1+s2}
+    vdup.32  q3, r1
+    vadd.u32 q2, q2, q3 // + current buffer address = the four entry addresses
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+    vmov.32 r6, d1[1] // s0+s1+s2+s3
+    add r1, r1, r6 // advance the buffer address past the four entries
+    b _assign_next
+_hash_assign_with_copy_x4:
+    vdup.32  q2, r1 // all four sizes are zero: every entry gets the current address
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+_assign_next:
+    subs r5, r5, #4
+    bne _hash_assign_loop_x4
+_hash_assign_x4_done:
+    ands r5, r2, #3 // remaining 0..3 entries, handled one at a time
+    beq _hash_assign_end
+_hash_assign_loop_x4_rem:
+    str r1, [r3], #4
+    str r1, [r4], #4
+    ldr r7, [r0], #4
+    lsl r7, r7, #2 // count -> size in bytes
+    add r1, r1, r7
+    subs r5, r5, #1
+    bne _hash_assign_loop_x4_rem
+_hash_assign_end:
+    ldmia sp!, {r4-r7}
+WELS_ASM_FUNC_END

+// Per-lane 16-bit constants loaded by FillQpelLocationByFeatureValue_neon (adr + vld1):
+//   mv_x_inc_x4    - added to the x coordinates after each group of four features
+//   mv_y_inc_x4    - added to the y coordinates after each row
+//   mx_x_offset_x4 - x coordinates of the first four features of a row
+// Only the low four lanes are non-zero. 16-byte alignment is requested with .p2align 4,
+// because ".align 16" on ARM means 2^16 bytes rather than 16 bytes.
+.p2align 4
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

+WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
+// void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+//                                           uint16_t** pFeatureValuePointerList)
+//
+// Walks kiHeight rows of kiWidth 16-bit feature values. For each feature f it stores the
+// 32-bit pair of 16-bit coordinates {x * 4, y * 4} at *pFeatureValuePointerList[f] and then
+// advances pFeatureValuePointerList[f] by 4 bytes.
+// Four features are consumed per inner iteration, so kiWidth must be a multiple of 4.
+//
+// In:   r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight, r3 = pFeatureValuePointerList
+// Uses: r4-r7 (saved and restored here), q0-q3 and q8-q12 (all caller-saved, so no NEON spill is needed)
+//       q8  = list base address in every lane      q9  = x increment per group of four
+//       q10 = y increment per row                  q11 = x coordinates at the start of a row
+//       q2  = current x coordinates                q3  = current y coordinates
+    cmp r1, #0
+    ble _fill_qpel_location_end // nothing to do; the down-counting loops below would wrap
+    cmp r2, #0
+    ble _fill_qpel_location_end
+    stmdb sp!, {r4-r7}
+    adr r7, mv_x_inc_x4
+    vld1.64 {q9}, [r7]
+    adr r7, mv_y_inc_x4
+    vld1.64 {q10}, [r7]
+    adr r7, mx_x_offset_x4
+    vld1.64 {q11}, [r7]
+    veor q3, q3 // y = 0
+    vdup.32 q8, r3
+_hash_height_loop:
+    mov r7, r1 // r7 = features left in this row
+    vmov q2, q11 // restart x at mx_x_offset_x4
+_hash_width_loop:
+    vld1.64 {d0}, [r0]! // four 16-bit features
+    vshll.u16 q0, d0, #2 // feature -> byte offset into the pointer list (4-byte pointers)
+    vadd.u32 q0, q8 // q0 = &pFeatureValuePointerList[feature] for each of the four
+    vmov q1, q2
+    vmov q12, q3
+    vzip.16 q1, q12 // q1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+    vmov.32 r4, d0[0]
+    ldr r5, [r4] // r5 = current write position for this feature
+    vmov.32 r6, d2[0]
+    str r6, [r5] // store {x, y}
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4] // write back the advanced position
+    vmov.32 r4, d0[1]
+    ldr r5, [r4]
+    vmov.32 r6, d2[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+    vmov.32 r4, d1[0]
+    ldr r5, [r4]
+    vmov.32 r6, d3[0]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+    vmov.32 r4, d1[1]
+    ldr r5, [r4]
+    vmov.32 r6, d3[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+    vadd.u16 q2, q2, q9 // x += 0x10 for the next four features
+    subs r7, #4
+    bne _hash_width_loop
+    vadd.u16 q3, q3, q10 // y += 4 for the next row
+    subs r2, #1
+    bne _hash_height_loop
+    ldmia sp!, {r4-r7}
+_fill_qpel_location_end:
+WELS_ASM_FUNC_END

 #endif

--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S

+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S

@@ -217,4 +217,121 @@

     cbnz x2, _height_loop16x16

 _SumOf16x16BlockOfFrame_AArch64_neon_end:

 WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
+// void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+//                                             uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+//
+// For each list entry i, stores the 64-bit address
+//     pBuf + 4 * (pTimesOfFeatureValue[0] + ... + pTimesOfFeatureValue[i-1])   (in bytes)
+// to both pLocationOfFeature[i] and pFeatureValuePointerList[i].
+//
+// In:   x0 = pTimesOfFeatureValue, x1 = pBuf, w2 = kiListSize, x3 = pLocationOfFeature,
+//       x4 = pFeatureValuePointerList
+// Uses: x5, x7, x8, v0-v3
+// kiListSize is a 32-bit argument, so only w2 is read: the upper half of x2 is not
+// guaranteed to be extended by the caller under AAPCS64.
+    and w5, w2, #0xfffffffc // w5 = kiListSize rounded down to a multiple of 4
+    cbz w5, _hash_assign_x4_done // no full group of 4: entering the loop with 0 would wrap the counter
+_hash_assign_loop_x4:
+    ld1 {v0.16b}, [x0], #16 // four counts
+    shl v0.4s, v0.4s, #2 // counts -> sizes in bytes (4 bytes per count)
+    addv s1, v0.4s
+    umov w7, v1.s[0]
+    cbz w7, _hash_assign_with_copy_x4 // sum of the four sizes is zero: take the copy path
+    ins v2.d[0], x1 // entry 0 address
+    umov w8, v0.s[0] // writing w8 zero-extends into x8
+    add x1, x1, x8
+    ins v2.d[1], x1 // entry 1 address
+    umov w8, v0.s[1]
+    add x1, x1, x8
+    ins v3.d[0], x1 // entry 2 address
+    umov w8, v0.s[2]
+    add x1, x1, x8
+    ins v3.d[1], x1 // entry 3 address
+    umov w8, v0.s[3]
+    add x1, x1, x8 // x1 = address following the four entries
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+    b _assign_next
+_hash_assign_with_copy_x4:
+    dup  v2.2d, x1 // every entry gets the current address
+    dup  v3.2d, x1
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+_assign_next:
+    sub w5, w5, #4
+    cbnz w5, _hash_assign_loop_x4
+_hash_assign_x4_done:
+    and w5, w2, #3 // remaining 0..3 entries, handled one at a time
+    cbz w5, _hash_assign_end
+_hash_assign_loop_x4_rem:
+    str x1, [x3], #8
+    str x1, [x4], #8
+    ldr w8, [x0], #4
+    lsl w8, w8, #2 // count -> size in bytes
+    add x1, x1, x8
+    sub w5, w5, #1
+    cbnz w5, _hash_assign_loop_x4_rem
+_hash_assign_end:
+WELS_ASM_AARCH64_FUNC_END

+// Per-lane 16-bit constants loaded by FillQpelLocationByFeatureValue_AArch64_neon (ldr q, literal):
+//   mv_x_inc_x4    - added to the x coordinates after each group of four features
+//   mv_y_inc_x4    - added to the y coordinates after each row
+//   mx_x_offset_x4 - x coordinates of the first four features of a row
+// Only the low four lanes are non-zero. 16-byte alignment is requested with .p2align 4,
+// because ".align 16" on AArch64 means 2^16 bytes rather than 16 bytes.
+.p2align 4
+mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

+WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
+// void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
+//                                                   const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+//
+// Walks kiHeight rows of kiWidth 16-bit feature values. For each feature f it stores the
+// 32-bit pair of 16-bit coordinates {x * 4, y * 4} at *pFeatureValuePointerList[f] and then
+// advances pFeatureValuePointerList[f] by 4 bytes.
+// Four features are consumed per inner iteration, so kiWidth must be a multiple of 4.
+//
+// In:   x0 = pFeatureOfBlock, w1 = kiWidth, w2 = kiHeight, x3 = pFeatureValuePointerList
+// Uses: x4-x7, v0-v3, v5-v7, v16-v18
+//       v16 = list base address in both lanes      v7 = x increment per group of four
+//       v6  = y increment per row                  v5 = x coordinates at the start of a row
+//       v2  = current x coordinates                v3 = current y coordinates
+// kiWidth and kiHeight are 32-bit arguments, so only w1/w2 are read: the upper halves of
+// x1/x2 are not guaranteed to be extended by the caller under AAPCS64.
+    cmp w1, #0
+    b.le _fill_qpel_location_end // nothing to do; the down-counting loops below would wrap
+    cmp w2, #0
+    b.le _fill_qpel_location_end
+    ldr q7, mv_x_inc_x4
+    ldr q6, mv_y_inc_x4
+    ldr q5, mx_x_offset_x4
+    eor v3.16b, v3.16b, v3.16b // y = 0
+    dup v16.2d, x3 // v8->v16
+_hash_height_loop:
+    mov w7, w1 // w7 = features left in this row
+    mov v2.16b, v5.16b //mx_x_offset_x4 (standard operand syntax instead of the Apple-only "mov.16b")
+_hash_width_loop:
+    ld1 {v0.d}[0], [x0], #8 // four 16-bit features
+    ushll v0.4s, v0.4h, #3 // feature -> byte offset into the pointer list (8-byte pointers)
+    uaddw   v17.2d, v16.2d, v0.2s // &pFeatureValuePointerList[feature] for features 0 and 1
+    uaddw2  v18.2d, v16.2d, v0.4s // ... and for features 2 and 3
+    zip1 v1.8h, v2.8h, v3.8h // v1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+    umov x4, v17.d[0]
+    ldr x5, [x4] // x5 = current write position for this feature
+    umov w6, v1.s[0]
+    str w6, [x5] // store {x, y}
+    add x5, x5, #4
+    str x5, [x4] // write back the advanced position
+    umov x4, v17.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[1]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+    umov x4, v18.d[0]
+    ldr x5, [x4]
+    umov w6, v1.s[2]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+    umov x4, v18.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[3]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+    add v2.8h, v2.8h, v7.8h // x += 0x10 for the next four features
+    sub w7, w7, #4
+    cbnz w7, _hash_width_loop
+    add v3.8h, v3.8h, v6.8h // y += 4 for the next row
+    sub w2, w2, #1
+    cbnz w2, _hash_height_loop
+_fill_qpel_location_end:
+WELS_ASM_AARCH64_FUNC_END

 #endif

\ No newline at end of file

--- a/codec/encoder/core/inc/svc_motion_estimate.h

+++ b/codec/encoder/core/inc/svc_motion_estimate.h

@@ -271,6 +271,10 @@

 #ifdef HAVE_NEON

 extern "C"

+// InitializeHashforFeature_neon (AArch32 NEON assembly): for each of the kiListSize entries i, writes the
+// byte address pBuf + 4 * (sum of pTimesOfFeatureValue[0..i-1]) to both pLocationOfFeature[i] and
+// pFeatureValuePointerList[i].

+void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,

+                                    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);

+// FillQpelLocationByFeatureValue_neon (AArch32 NEON assembly): for every feature value f in the
+// kiWidth x kiHeight array pFeatureOfBlock, stores the 16-bit coordinate pair {x * 4, y * 4} at
+// *pFeatureValuePointerList[f] and advances that pointer by 4 bytes. The assembly consumes four features
+// per iteration, so kiWidth is expected to be a multiple of 4.

+void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,

+                                          uint16_t** pFeatureValuePointerList);

 int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);

 int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);

 void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

@@ -285,6 +289,10 @@

 #ifdef HAVE_NEON_AARCH64

 extern "C"

+// InitializeHashforFeature_AArch64_neon (AArch64 NEON assembly): for each of the kiListSize entries i, writes
+// the byte address pBuf + 4 * (sum of pTimesOfFeatureValue[0..i-1]) to both pLocationOfFeature[i] and
+// pFeatureValuePointerList[i].

+void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,

+                                    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);

+// FillQpelLocationByFeatureValue_AArch64_neon (AArch64 NEON assembly): for every feature value f in the
+// kiWidth x kiHeight array pFeatureOfBlock, stores the 16-bit coordinate pair {x * 4, y * 4} at
+// *pFeatureValuePointerList[f] and advances that pointer by 4 bytes. The assembly consumes four features
+// per iteration, so kiWidth is expected to be a multiple of 4.

+void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,

+                                          uint16_t** pFeatureValuePointerList);

 int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);

 int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);

 void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

--- a/codec/encoder/core/src/svc_motion_estimate.cpp

+++ b/codec/encoder/core/src/svc_motion_estimate.cpp

@@ -125,6 +125,8 @@

 #if defined (HAVE_NEON)

     if (uiCpuFlag & WELS_CPU_NEON) {

       //for feature search

+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;

+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;

       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;

       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;

       //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

@@ -136,6 +138,8 @@

 #if defined (HAVE_NEON_AARCH64)

     if (uiCpuFlag & WELS_CPU_NEON) {

       //for feature search

+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;

+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;

       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;

       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;

       //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

--- a/test/encoder/EncUT_SVC_me.cpp

+++ b/test/encoder/EncUT_SVC_me.cpp

@@ -281,6 +281,10 @@

 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)

 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)

 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)

+// Instantiate tests pairing the AArch32 NEON feature-hash routines with their *_ref counterparts.
+// NOTE(review): the two trailing numbers are presumably frame width and height; confirm against the
+// GENERATE_* macro definitions, which are outside this hunk.

+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)

+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)

+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)

+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)

 #endif

 #ifdef HAVE_NEON_AARCH64

@@ -290,4 +294,8 @@

 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)

 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)

 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)

+// Instantiate tests pairing the AArch64 NEON feature-hash routines with their *_ref counterparts.
+// NOTE(review): the two trailing numbers are presumably frame width and height; confirm against the
+// GENERATE_* macro definitions, which are outside this hunk.

+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)

+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)

+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)

+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)

 #endif