shithub: openh264

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj

+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj

@@ -45,6 +45,8 @@

 		4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446F818BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp */; };

 		4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446FA18BC605C0017DF25 /* wels_preprocess.cpp */; };

 		4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };

+		6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */; };

+		6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */; };

 		9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };

 		9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };

 		F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };

@@ -154,6 +156,8 @@

 		4CE446FE18BC605C0017DF25 /* welsEncoderExt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = welsEncoderExt.h; sourceTree = "<group>"; };

 		4CE4470418BC605C0017DF25 /* wels_enc_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_enc_export.def; sourceTree = "<group>"; };

 		4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsEncoderExt.cpp; sourceTree = "<group>"; };

+		6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = svc_motion_estimation.S; sourceTree = "<group>"; };

+		6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = svc_motion_estimation_aarch64_neon.S; path = arm64/svc_motion_estimation_aarch64_neon.S; sourceTree = "<group>"; };

 		9AED664819469FAF009A3567 /* welsCodecTrace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = welsCodecTrace.h; path = ../../../common/inc/welsCodecTrace.h; sourceTree = "<group>"; };

 		9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };

 		9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };

@@ -177,6 +181,7 @@

 		4C34066418C57D0400DFA14A /* arm */ = {

 			isa = PBXGroup;

 			children = (

+				6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */,

 				4C34066618C57D0400DFA14A /* intra_pred_neon.S */,

 				4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,

 				4C34066918C57D0400DFA14A /* memory_neon.S */,

@@ -189,6 +194,7 @@

 		4CB8F2B219235FAC005D6386 /* arm64 */ = {

 			isa = PBXGroup;

 			children = (

+				6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */,

 				F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */,

 				F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,

 				4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,

@@ -423,6 +429,7 @@

 				4CE4471D18BC605C0017DF25 /* property.cpp in Sources */,

 				4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,

 				4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,

+				6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */,

 				4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,

 				4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,

 				9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */,

@@ -455,6 +462,7 @@

 				4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,

 				4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,

 				4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,

+				6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */,

 				4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,

 				4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,

);

--- /dev/null

+++ b/codec/encoder/core/arm/svc_motion_estimation.S

@@ -1,0 +1,168 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef  HAVE_NEON

+.text

+#include "arm_arch_common_macro.S"

+WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
+//int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
+// In:    r0 = pRef, r1 = kiRefStride (bytes from one row to the next)
+// Out:   r0 = sum of the 8x8 bytes whose top-left corner is pRef
+// Uses:  q0, q1 (caller-saved)
+    vld1.64 {d0}, [r0], r1          // row 0
+    vld1.64 {d1}, [r0], r1          // row 1
+    vpaddl.u8 q0, q0                // eight u16 partial sums (byte pairs of rows 0-1)
+// rows 2-7, two rows per pass, accumulated into the same eight u16 lanes
+.rept 3
+    vld1.64 {d2}, [r0], r1
+    vld1.64 {d3}, [r0], r1
+    vpadal.u8 q0, q1
+.endr
+    vadd.i16 d0, d0, d1             // 8 -> 4 lanes, each at most 16 * 255, fits in u16
+    vpaddl.u16 d0, d0               // 4 -> 2 lanes (u32)
+    vpaddl.u32 d0, d0               // 2 -> 1 lane, low word holds the total
+    vmov.32 r0, d0[0]               // return value
+WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
+//int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
+// In:    r0 = pRef, r1 = kiRefStride (bytes from one row to the next)
+// Out:   r0 = sum of the 16x16 bytes whose top-left corner is pRef
+// Uses:  q0, q1 (caller-saved)
+    vld1.64 {d0, d1}, [r0], r1      // row 0
+    vpaddl.u8 q0, q0                // eight u16 partial sums (byte pairs of row 0)
+// rows 1-15 are accumulated into the same eight u16 lanes
+.rept 15
+    vld1.64 {d2, d3}, [r0], r1
+    vpadal.u8 q0, q1
+.endr
+    vadd.i16 d0, d0, d1             // 8 -> 4 lanes, each at most 64 * 255, fits in u16
+    vpaddl.u16 d0, d0               // 4 -> 2 lanes (u32)
+    vpaddl.u32 d0, d0               // 2 -> 1 lane, low word holds the total
+    vmov.32 r0, d0[0]               // return value
+WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
+//   sum = sum of the 8x8 bytes at pRefPicture + y * kiRefStride + x
+//   pFeatureOfBlock[y * kiWidth + x] = sum
+//   pTimesOfFeatureValue[sum]++
+// In:    r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
+//        [sp] = pFeatureOfBlock, [sp, #4] = pTimesOfFeatureValue
+// Roles: r4 = end of the current pFeatureOfBlock row, r5 = pTimesOfFeatureValue,
+//        r6 = kiWidth, r7 = blocks left in the current row,
+//        r8 = current picture row + kiWidth, r2 = rows left, q0-q3 = pixel data
+// Both loops test their counter after the body, so a zero or negative
+// dimension is rejected here; the function then writes nothing.
+    cmp r1, #0
+    ble _frame_end8x8
+    cmp r2, #0
+    ble _frame_end8x8
+    stmdb sp!, {r4-r8}              // 5 registers = 20 bytes pushed
+    ldr	r5, [sp, #24] //pTimesOfFeatureValue
+    ldr	r4, [sp, #20] //pFeatureOfBlock
+    mov r6, r1
+    add r8, r0, r6                  // r8 - r7 is the address of block x = kiWidth - r7
+    add r4, r4, r6, lsl #1          // same trick for the uint16_t output row
+_height_loop8x8:
+    mov r7, r6
+_width_loop8x8:
+    sub r0, r8, r7                  // r0 = top-left byte of the current block
+    vld1.64 {d0}, [r0], r3
+    vld1.64 {d1}, [r0], r3
+    vld1.64 {d2}, [r0], r3
+    vld1.64 {d3}, [r0], r3
+    vld1.64 {d4}, [r0], r3
+    vld1.64 {d5}, [r0], r3
+    vld1.64 {d6}, [r0], r3
+    vld1.64 {d7}, [r0]
+    vpaddl.u8 q0, q0                // eight u16 partial sums
+    vpadal.u8 q0, q1
+    vpadal.u8 q0, q2
+    vpadal.u8 q0, q3
+    vpaddl.u16 q0, q0               // four u32 partial sums
+    vpadd.i32 d0, d1
+    vpadd.i32 d0, d0                // both lanes of d0 = block sum
+    sub r1, r4, r7, lsl #1          // r1 = &pFeatureOfBlock[current block]
+    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+    vmov    r0, r1, d0              // r0 = sum (r1 is reloaded on the next line)
+    add r1, r5, r0, lsl #2          // r1 = &pTimesOfFeatureValue[sum]
+    ldr r0, [r1]
+    add r0, r0, #1
+    str r0, [r1]
+    subs r7, r7, #1
+    bne _width_loop8x8
+    add r8, r8, r3                  // next picture row
+    add r4, r4, r6, lsl #1          // next output row
+    subs r2, r2, #1
+    bne _height_loop8x8
+    ldmia sp!, {r4-r8}
+_frame_end8x8:
+WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
+//   sum = sum of the 16x16 bytes at pRefPicture + y * kiRefStride + x
+//   pFeatureOfBlock[y * kiWidth + x] = sum
+//   pTimesOfFeatureValue[sum]++
+// In:    r0 = pRefPicture, r1 = kiWidth, r2 = kiHeight, r3 = kiRefStride,
+//        [sp] = pFeatureOfBlock, [sp, #4] = pTimesOfFeatureValue
+// Roles: r4 = end of the current pFeatureOfBlock row, r5 = pTimesOfFeatureValue,
+//        r6 = kiWidth, r7 = blocks left in the current row,
+//        r8 = current picture row + kiWidth, r2 = rows left, q0-q1 = pixel data
+// Both loops test their counter after the body, so a zero or negative
+// dimension is rejected here; the function then writes nothing.
+    cmp r1, #0
+    ble _frame_end16x16
+    cmp r2, #0
+    ble _frame_end16x16
+    stmdb sp!, {r4-r8}              // 5 registers = 20 bytes pushed
+    ldr	r5, [sp, #24] //pTimesOfFeatureValue
+    ldr	r4, [sp, #20] //pFeatureOfBlock
+    mov r6, r1
+    add r8, r0, r6                  // r8 - r7 is the address of block x = kiWidth - r7
+    add r4, r4, r6, lsl #1          // same trick for the uint16_t output row
+_height_loop16x16:
+    mov r7, r6
+_width_loop16x16:
+    sub r0, r8, r7                  // r0 = top-left byte of the current block
+    vld1.64 {q0}, [r0], r3
+    vpaddl.u8 q0, q0                // eight u16 partial sums of row 0
+.rept 15
+    vld1.64 {q1}, [r0], r3
+    vpadal.u8 q0, q1
+.endr
+    vpaddl.u16 q0, q0               // four u32 partial sums
+    vpadd.i32 d0, d1
+    vpadd.i32 d0, d0                // both lanes of d0 = block sum
+    sub r1, r4, r7, lsl #1          // r1 = &pFeatureOfBlock[current block]
+    vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
+    vmov    r0, r1, d0              // r0 = sum (r1 is reloaded on the next line)
+    add r1, r5, r0, lsl #2          // r1 = &pTimesOfFeatureValue[sum]
+    ldr r0, [r1]
+    add r0, r0, #1
+    str r0, [r1]
+    subs r7, r7, #1
+    bne _width_loop16x16
+    add r8, r8, r3                  // next picture row
+    add r4, r4, r6, lsl #1          // next output row
+    subs r2, r2, #1
+    bne _height_loop16x16
+    ldmia sp!, {r4-r8}
+_frame_end16x16:
+WELS_ASM_FUNC_END

+#endif

\ No newline at end of file

--- /dev/null

+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S

@@ -1,0 +1,151 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef  HAVE_NEON_AARCH64

+.text

+#include "arm_arch64_common_macro.S"

+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
+//int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
+// In:    x0 = pRef, w1 = kiRefStride (bytes from one row to the next)
+// Out:   w0 = sum of the 8x8 bytes whose top-left corner is pRef
+// Uses:  v0-v3
+    // kiRefStride is a 32-bit argument: the upper half of x1 is not defined by
+    // the caller, so widen it before it is used as a 64-bit post-index offset.
+    sxtw x1, w1
+    ld1 {v0.d}[0], [x0], x1
+    ld1 {v0.d}[1], [x0], x1
+    ld1 {v1.d}[0], [x0], x1
+    ld1 {v1.d}[1], [x0], x1
+    ld1 {v2.d}[0], [x0], x1
+    ld1 {v2.d}[1], [x0], x1
+    ld1 {v3.d}[0], [x0], x1
+    ld1 {v3.d}[1], [x0]
+    uaddlp v0.8h, v0.16b            // eight u16 partial sums (rows 0-1)
+    uadalp v0.8h, v1.16b
+    uadalp v0.8h, v2.16b
+    uadalp v0.8h, v3.16b
+    uaddlv s0, v0.8h                // s0 = total of the eight lanes
+    fmov   w0, s0                   // return value
+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
+//int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
+// In:    x0 = pRef, w1 = kiRefStride (bytes from one row to the next)
+// Out:   w0 = sum of the 16x16 bytes whose top-left corner is pRef
+// Uses:  v0, v1
+    // kiRefStride is a 32-bit argument: the upper half of x1 is not defined by
+    // the caller, so widen it before it is used as a 64-bit post-index offset.
+    sxtw x1, w1
+    ld1 {v0.16b}, [x0], x1
+    uaddlp v0.8h, v0.16b            // eight u16 partial sums of row 0
+.rept 15
+    ld1 {v1.16b}, [x0], x1
+    uadalp v0.8h, v1.16b
+.endr
+    uaddlv s0, v0.8h                // s0 = total of the eight lanes
+    fmov   w0, s0                   // return value
+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
+//   sum = sum of the 8x8 bytes at pRefPicture + y * kiRefStride + x
+//   pFeatureOfBlock[y * kiWidth + x] = sum
+//   pTimesOfFeatureValue[sum]++
+// In:    x0 = pRefPicture, w1 = kiWidth, w2 = kiHeight, w3 = kiRefStride,
+//        x4 = pFeatureOfBlock, x5 = pTimesOfFeatureValue
+// Roles: x4 = end of the current pFeatureOfBlock row, x6 = kiWidth,
+//        x7 = blocks left in the current row, x8 = current picture row + kiWidth,
+//        w2 = rows left, v0-v3 = pixel data
+    // Both loops test their counter after the body, so a zero or negative
+    // dimension is rejected here; the function then writes nothing.
+    cmp w1, #0
+    b.le _frame_end8x8
+    cmp w2, #0
+    b.le _frame_end8x8
+    // The int32_t arguments are widened before 64-bit use: the upper halves of
+    // x1 and x3 are not defined by the caller.
+    sxtw x6, w1
+    sxtw x3, w3
+    add x8, x0, x6                  // x8 - x7 is the address of block x = kiWidth - x7
+    add x4, x4, x6, lsl #1          // same trick for the uint16_t output row
+_height_loop8x8:
+    mov x7, x6
+_width_loop8x8:
+    sub x0, x8, x7                  // x0 = top-left byte of the current block
+    ld1 {v0.d}[0], [x0], x3
+    ld1 {v0.d}[1], [x0], x3
+    ld1 {v1.d}[0], [x0], x3
+    ld1 {v1.d}[1], [x0], x3
+    ld1 {v2.d}[0], [x0], x3
+    ld1 {v2.d}[1], [x0], x3
+    ld1 {v3.d}[0], [x0], x3
+    ld1 {v3.d}[1], [x0]
+    uaddlp v0.8h, v0.16b            // eight u16 partial sums (rows 0-1)
+    uadalp v0.8h, v1.16b
+    uadalp v0.8h, v2.16b
+    uadalp v0.8h, v3.16b
+    uaddlv s0, v0.8h                // s0 = block sum
+    sub x1, x4, x7, lsl #1          // x1 = &pFeatureOfBlock[current block]
+    st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
+    fmov w0, s0                     // x0 = sum (the write to w0 clears the upper half)
+    add x1, x5, x0, lsl #2          // x1 = &pTimesOfFeatureValue[sum]
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1]
+    subs x7, x7, #1
+    b.ne _width_loop8x8
+    add x8, x8, x3                  // next picture row
+    add x4, x4, x6, lsl #1          // next output row
+    subs w2, w2, #1
+    b.ne _height_loop8x8
+_frame_end8x8:
+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
+//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+// For every block position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
+//   sum = sum of the 16x16 bytes at pRefPicture + y * kiRefStride + x
+//   pFeatureOfBlock[y * kiWidth + x] = sum
+//   pTimesOfFeatureValue[sum]++
+// In:    x0 = pRefPicture, w1 = kiWidth, w2 = kiHeight, w3 = kiRefStride,
+//        x4 = pFeatureOfBlock, x5 = pTimesOfFeatureValue
+// Roles: x4 = end of the current pFeatureOfBlock row, x6 = kiWidth,
+//        x7 = blocks left in the current row, x8 = current picture row + kiWidth,
+//        w2 = rows left, v0-v1 = pixel data
+    // Both loops test their counter after the body, so a zero or negative
+    // dimension is rejected here; the function then writes nothing.
+    cmp w1, #0
+    b.le _frame_end16x16
+    cmp w2, #0
+    b.le _frame_end16x16
+    // The int32_t arguments are widened before 64-bit use: the upper halves of
+    // x1 and x3 are not defined by the caller.
+    sxtw x6, w1
+    sxtw x3, w3
+    add x8, x0, x6                  // x8 - x7 is the address of block x = kiWidth - x7
+    add x4, x4, x6, lsl #1          // same trick for the uint16_t output row
+_height_loop16x16:
+    mov x7, x6
+_width_loop16x16:
+    sub x0, x8, x7                  // x0 = top-left byte of the current block
+    ld1 {v0.16b}, [x0], x3
+    uaddlp v0.8h, v0.16b            // eight u16 partial sums of row 0
+.rept 15
+    ld1 {v1.16b}, [x0], x3
+    uadalp v0.8h, v1.16b
+.endr
+    uaddlv s0, v0.8h                // s0 = block sum
+    sub x1, x4, x7, lsl #1          // x1 = &pFeatureOfBlock[current block]
+    st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
+    fmov w0, s0                     // x0 = sum (the write to w0 clears the upper half)
+    add x1, x5, x0, lsl #2          // x1 = &pTimesOfFeatureValue[sum]
+    ldr w0, [x1]
+    add w0, w0, #1
+    str w0, [x1]
+    subs x7, x7, #1
+    b.ne _width_loop16x16
+    add x8, x8, x3                  // next picture row
+    add x4, x4, x6, lsl #1          // next output row
+    subs w2, w2, #1
+    b.ne _height_loop16x16
+_frame_end16x16:
+WELS_ASM_AARCH64_FUNC_END

+#endif

\ No newline at end of file

--- a/codec/encoder/core/inc/svc_motion_estimate.h

+++ b/codec/encoder/core/inc/svc_motion_estimate.h

@@ -244,6 +244,33 @@

 void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

                                const int32_t kiRefStride,

                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);

+// ARM NEON versions of the block-sum feature functions, implemented in
+// core/arm/svc_motion_estimation.S. They take the same parameters as the
+// SumOf*_c functions declared above.
+#ifdef HAVE_NEON

+extern "C"

+{

+// Return the sum of the 8x8 (resp. 16x16) bytes whose top-left corner is pRef;
+// kiRefStride is the byte distance between two rows.
+int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);

+int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);

+// For each of the kiWidth x kiHeight block positions, store the block sum in
+// pFeatureOfBlock and increment pTimesOfFeatureValue[sum].
+void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

+                                const int32_t kiRefStride,

+                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);

+void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

+                                  const int32_t kiRefStride,

+                                  uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);

+}

+#endif

+// AArch64 NEON versions of the block-sum feature functions, implemented in
+// core/arm64/svc_motion_estimation_aarch64_neon.S. They take the same
+// parameters as the SumOf*_c functions declared above.
+#ifdef HAVE_NEON_AARCH64

+extern "C"

+{

+// Return the sum of the 8x8 (resp. 16x16) bytes whose top-left corner is pRef;
+// kiRefStride is the byte distance between two rows.
+int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);

+int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);

+// For each of the kiWidth x kiHeight block positions, store the block sum in
+// pFeatureOfBlock and increment pTimesOfFeatureValue[sum].
+void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

+                                const int32_t kiRefStride,

+                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);

+void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

+                                  const int32_t kiRefStride,

+                                  uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);

+}

+#endif

 int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight,

     const int32_t iNeedFeatureStorage,

     SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);

--- a/codec/encoder/core/src/svc_motion_estimate.cpp

+++ b/codec/encoder/core/src/svc_motion_estimate.cpp

@@ -102,6 +102,23 @@

     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

     pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;

     pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;

+#if defined (HAVE_NEON)

+    //for feature search

+    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;

+    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;

+    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

+    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;

+    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;

+#endif

+#if defined (HAVE_NEON_AARCH64)

+    //for feature search

+    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;

+    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;

+    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?

+    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;

+    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;

+#endif

--- a/codec/encoder/targets.mk

+++ b/codec/encoder/targets.mk

@@ -53,6 +53,7 @@

 	$(ENCODER_SRCDIR)/core/arm/memory_neon.S\

 	$(ENCODER_SRCDIR)/core/arm/pixel_neon.S\

 	$(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\

+	$(ENCODER_SRCDIR)/core/arm/svc_motion_estimation.S\

 ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))

 endif

@@ -64,6 +65,7 @@

 	$(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\

 	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\

 	$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\

+	$(ENCODER_SRCDIR)/core/arm64/svc_motion_estimation_aarch64_neon.S\

 ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))

 endif

--- /dev/null

+++ b/test/encoder/EncUT_SVC_me.cpp

@@ -1,0 +1,157 @@

+#include <gtest/gtest.h>

+#include <math.h>

+#include <stdlib.h>

+#include <time.h>

+#include "cpu_core.h"

+#include "cpu.h"

+#include "macros.h"

+#include "svc_motion_estimate.h"

+using namespace WelsSVCEnc;

+#define SVC_ME_TEST_NUM 10

+// Overwrite p[0] .. p[Len - 1], in ascending order, with bytes taken from rand().
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+  int32_t iPos = 0;
+  while (iPos < Len) {
+    p[iPos] = rand() % 256;
+    ++iPos;
+  }
+}

+// Reference (anchor) implementations of the block-sum feature functions

+// Reference sum of the 8x8 bytes whose top-left corner is pRef; rows are
+// kiRefStride bytes apart.
+int32_t SumOf8x8SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    for (int32_t iCol = 0; iCol < 8; ++iCol) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride;
+  }
+  return iTotal;
+}

+// Reference sum of the 16x16 bytes whose top-left corner is pRef; rows are
+// kiRefStride bytes apart.
+int32_t SumOf16x16SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride;
+  }
+  return iTotal;
+}

+// Reference for the SumOf8x8BlockOfFrame_* functions. For every block position
+// (x, y) in a kiWidth x kiHeight grid it stores the 8x8 sum in
+// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// The per-block sum comes from SumOf8x8SingleBlock_ref, so this anchor does not
+// depend on the SumOf8x8SingleBlock_c code it is used to verify.
+void SumOf8x8BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                               const int32_t kiRefStride,
+                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  int32_t x, y;
+  uint8_t* pRef;
+  uint16_t* pBuffer;
+  int32_t iSum;
+  for (y = 0; y < kiHeight; y++) {
+    pRef = pRefPicture  + kiRefStride * y;
+    pBuffer  = pFeatureOfBlock + kiWidth * y;
+    for (x = 0; x < kiWidth; x++) {
+      iSum = SumOf8x8SingleBlock_ref (pRef + x, kiRefStride);
+      pBuffer[x] = iSum;
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}

+// Reference for the SumOf16x16BlockOfFrame_* functions. For every block position
+// (x, y) in a kiWidth x kiHeight grid it stores the 16x16 sum in
+// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
+// The per-block sum comes from SumOf16x16SingleBlock_ref, so this anchor does
+// not depend on the SumOf16x16SingleBlock_c code it is used to verify.
+void SumOf16x16BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+                                 const int32_t kiRefStride,
+                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
+  //TODO: this is similar to SumOf8x8BlockOfFrame_ref except for the single block func called, refactor-able?
+  int32_t x, y;
+  uint8_t* pRef;
+  uint16_t* pBuffer;
+  int32_t iSum;
+  for (y = 0; y < kiHeight; y++) {
+    pRef = pRefPicture  + kiRefStride * y;
+    pBuffer  = pFeatureOfBlock + kiWidth * y;
+    for (x = 0; x < kiWidth; x++) {
+      iSum = SumOf16x16SingleBlock_ref (pRef + x, kiRefStride);
+      pBuffer[x] = iSum;
+      pTimesOfFeatureValue[iSum]++;
+    }
+  }
+}

+// Defines the gtest case SVC_ME_FunTest.<method>. It runs SVC_ME_TEST_NUM
+// rounds; each round fills a 16-row buffer of 320-byte rows with random data
+// and asserts that anchor and method return the same sum for the block at the
+// start of the buffer (stride 320).
+#define GENERATE_SumOfSingleBlock(anchor, method) \

+TEST (SVC_ME_FunTest, method) {\

+  ENFORCE_STACK_ALIGN_1D (uint8_t,  uiRefBuf,   16*320, 16);\

+  int32_t iRes[2];\

+  for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) {\

+    FillWithRandomData (uiRefBuf,16*320);\

+    iRes[0] = anchor (uiRefBuf,320);\

+    iRes[1] = method (uiRefBuf,320);\

+    ASSERT_EQ (iRes[0], iRes[1]);\

+  }\

+}

+// Single-block tests: the C functions always, the NEON functions only when the
+// matching HAVE_NEON / HAVE_NEON_AARCH64 macro is defined.
+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)

+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)

+#ifdef HAVE_NEON

+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)

+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)

+#endif

+#ifdef HAVE_NEON_AARCH64

+GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_AArch64_neon)

+GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_AArch64_neon)

+#endif

+// Declares two pointers: _nbuff, a new[] allocation of (_sz + _al - 1) elements
+// of type _tp, and _nm, a pointer inside that allocation with room for _sz
+// elements. _nm is derived by stepping (_al - 1) elements forward and then back
+// by that address's misalignment relative to _al bytes (converted to elements),
+// so it is _al-byte aligned provided new[] returns storage aligned to
+// sizeof(_tp). The user must release the memory with delete[] _nbuff.
+#define ENFORCE_NEW_ALIGN_1D(_tp, _nm, _nbuff, _sz, _al) \

+_tp *_nbuff = new _tp[(_sz)+(_al)-1]; \

+_tp *_nm = _nbuff + ((_al)-1) - (((uintptr_t)(_nbuff + ((_al)-1)) & ((_al)-1))/sizeof(_tp));

+// Defines the gtest case SVC_ME_FunTest.<method>_<kiWidth>x<kiHeight>. It runs
+// SVC_ME_TEST_NUM rounds; each round fills a random picture of (kiHeight + 16)
+// rows whose stride is kiWidth rounded up to a multiple of 16, plus 16, runs
+// anchor and method over the kiWidth x kiHeight block grid, and asserts that
+// both produce the same pFeatureOfBlock contents and the same histogram.
+// The two 65536-entry histograms (512 KiB in total) are allocated on the heap
+// so that the test does not depend on a large thread stack.
+#define GENERATE_SumOfFrame(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock1, pFeatureOfBlockBuff1, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock2, pFeatureOfBlockBuff2, (kiWidth*kiHeight), 16) \
+uint32_t (*pTimesOfFeatureValue)[65536] = new uint32_t[2][65536]; \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) {\
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16));\
+  memset(pTimesOfFeatureValue[0], 0, 65536*sizeof(uint32_t)); \
+  memset(pTimesOfFeatureValue[1], 0, 65536*sizeof(uint32_t)); \
+  anchor (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock1,pTimesOfFeatureValue[0]); \
+  method (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock2,pTimesOfFeatureValue[1]); \
+  for(int32_t j=0;j<kiWidth*kiHeight;j++){\
+      ASSERT_EQ (pFeatureOfBlock1[j], pFeatureOfBlock2[j]);\
+  }\
+  for(int32_t  j=0;j<65536;j++){\
+      ASSERT_EQ (pTimesOfFeatureValue[0][j], pTimesOfFeatureValue[1][j]);\
+  }\
+}\
+delete[] pTimesOfFeatureValue; \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff1; \
+delete[] pFeatureOfBlockBuff2; \
+}

+// Whole-frame tests on 1x1, 1x320 and 640x320 block grids: the C functions
+// always, the NEON functions only when the matching HAVE_NEON /
+// HAVE_NEON_AARCH64 macro is defined.
+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)

+#ifdef HAVE_NEON

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)

+#endif

+#ifdef HAVE_NEON_AARCH64

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 1)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 1)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)

+GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)

+GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)

+#endif

--- a/test/encoder/targets.mk

+++ b/test/encoder/targets.mk

@@ -1,7 +1,7 @@

 ENCODER_UNITTEST_SRCDIR=test/encoder

 ENCODER_UNITTEST_CPP_SRCS=\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\

-	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\

+    $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\

@@ -13,6 +13,7 @@

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\

+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\

 ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))