shithub: openh264

Download patch

ref: b35f5797de9d23878fe8d4178da05061e770685c
parent: 9d2e1a9384dc680758700b187485be1f30a8baed
author: zhiliang wang <[email protected]>
date: Thu Aug 14 14:41:52 EDT 2014

Add x86 32/64bit asm code for Scc_hash

--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -252,6 +252,10 @@
 #ifdef X86_ASM
 extern "C"
 {
+void InitializeHashforFeature_sse2 (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_sse2 (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                           uint16_t** pFeatureValuePointerList);
 int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -107,6 +107,8 @@
 #if defined (X86_ASM)
     if (uiCpuFlag & WELS_CPU_SSE2) {
         //for feature search
+      pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2;
+      pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2;
       pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
       pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
         //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -31,6 +31,16 @@
 ;*************************************************************************/
 %include "asm_inc.asm"
 
+;***********************************************************************
+; Local Data (Read Only): constants of FillQpelLocationByFeatureValue_sse2.
+; Every table is four 16-bit words; the code loads each one with movq.
+;***********************************************************************
+SECTION .rodata align=16
+ALIGN 16
+mv_x_inc_x4:	times 4 dw 0x10			; x step applied after each group of four blocks
+mv_y_inc_x4:	times 4 dw 0x04			; y step applied after each row of blocks
+mx_x_offset_x4:	dw 0x00, 0x04, 0x08, 0x0C	; x of the four blocks that open a row
+
 SECTION .text
 %ifdef X86_32
 ;**********************************************************************************************************************
@@ -661,6 +671,159 @@
 %undef		tmp_width
     ret
 
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+; Per block: stores dword (y<<16)|x at pFeatureValuePointerList[feature], then advances that pointer by 4 bytes. kiWidth must be a multiple of 4.
+;-----------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+	push	esi
+	push	edi
+	push	ebx
+	push	ebp
+	%define _ps			16				; push size
+	%define	_ls			4				; local size
+	%define	sum_ref		esp+_ps+_ls+4	; pFeatureOfBlock
+	%define width		esp+_ps+_ls+8	; kiWidth
+	%define height		esp+_ps+_ls+12	; kiHeight
+	%define	pos_list	esp+_ps+_ls+16	; pFeatureValuePointerList
+	%define	i_height	esp				; rows still to process
+	sub		esp,	_ls
+	mov		esi,	[sum_ref]			; esi = read cursor in the feature map
+	mov		edi,	[pos_list]			; edi = table of write pointers, indexed by feature
+	mov		ebp,	[width]
+	mov		ebx,	[height]
+	test	ebp,	ebp					; the loop counters below only stop at exactly zero,
+	jle		near FILL_QPEL_LOCATION_END_SSE2	; so leave now when there is no column ...
+	test	ebx,	ebx
+	jle		near FILL_QPEL_LOCATION_END_SSE2	; ... or no row to process
+	mov		[i_height],	ebx
+	movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
+	movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
+	movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector at the start of a row
+	pxor	xmm4,	xmm4				; zero, used to widen words to dwords
+	pxor	xmm3,	xmm3				; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+	movdqa	xmm2,	xmm5				; restart x at the row's first group
+	mov		ecx,	ebp					; ecx = blocks left in this row
+HASH_WIDTH_LOOP_SSE2:
+	movq	xmm0,	[esi]				; four 16-bit feature values
+	punpcklwd	xmm0,	xmm4			; xmm0 = the four features as dwords
+	movdqa		xmm1,	xmm2
+	punpcklwd	xmm1,	xmm3			; xmm1 = four dwords (y<<16)|x
+%rep	3
+	movd	edx,	xmm0				; edx = feature of the current block
+	lea		ebx,	[edi+edx*4]			; ebx = &pFeatureValuePointerList[feature]
+	mov		eax,	[ebx]				; eax = current write position of that list
+	movd	[eax],	xmm1
+	prefetcht0	[eax+4]					; warm the next slot; unlike a plain load this cannot fault past the buffer end
+	lea		eax,	[eax+4]
+	mov		[ebx],	eax					; publish the advanced write position
+	psrldq	xmm1,	4
+	psrldq	xmm0,	4
+%endrep
+	movd	edx,	xmm0				; fourth block of the group (no shift needed afterwards)
+	lea		ebx,	[edi+edx*4]
+	mov		eax,	[ebx]
+	movd	[eax],	xmm1
+	prefetcht0	[eax+4]					; same non-faulting hint as above
+	lea		eax,	[eax+4]
+	mov		[ebx],	eax
+	paddw	xmm2,	xmm7				; x of the next four blocks
+	lea		esi,	[esi+8]
+	sub		ecx,	4
+	jnz near HASH_WIDTH_LOOP_SSE2
+	paddw	xmm3,	xmm6				; y of the next row
+	dec	dword [i_height]
+	jnz	near HASH_HEIGHT_LOOP_SSE2
+FILL_QPEL_LOCATION_END_SSE2:
+	add		esp,	_ls
+	%undef	_ps
+	%undef	_ls
+	%undef	sum_ref
+	%undef	pos_list
+	%undef	width
+	%undef	height
+	%undef	i_height
+	pop		ebp
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                        uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
+; For i in [0, kiListSize): both tables get the running pointer (initially pBuf), which then advances by 4*pTimesOfFeatureValue[i] bytes.
+; The x4 loop accesses all three arrays with movdqa, so they must be 16-byte aligned. Does nothing when kiListSize <= 0.
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN InitializeHashforFeature_sse2
+	push	ebx
+	push	esi
+	push	edi
+	push	ebp
+	%define	_ps	16	; push size
+	mov		edi,	[esp+_ps+16]	; pLocationOfFeature
+	mov		ebp,	[esp+_ps+20]	; pFeatureValuePointerList
+	mov		esi,	[esp+_ps+4]     ; pTimesOfFeatureValue
+	mov		ebx,	[esp+_ps+8]     ; pBuf, becomes the running pointer
+	mov		edx,	[esp+_ps+12]	; kiListSize
+	xor		ecx,	ecx				; ecx = byte offset into all three arrays
+	test	edx,	edx
+	jle		near hash_assign_no_rem_sse2	; empty (or negative) list: nothing to assign
+	pxor	xmm7,	xmm7
+	sar		edx,	2				; edx = number of complete groups of four
+	jz		near hash_assign_rem_begin_sse2	; fewer than four entries: a zero count must not enter the dec/jnz loop
+hash_assign_loop_x4_sse2:
+	movdqa	xmm0,	[esi+ecx]
+	pslld	xmm0,	2				; four byte lengths (count * 4)
+
+	movdqa	xmm1,	xmm0
+	pcmpeqd	xmm1,	xmm7
+	movmskps	eax,	xmm1
+    cmp eax, 0x0f
+	je	near hash_assign_with_copy_sse2	; all four counts are zero: same pointer for all four entries
+%assign x	0
+%rep 4
+	mov		[edi+ecx+x],	ebx
+	mov		[ebp+ecx+x],	ebx
+	movd	eax,	xmm0
+	add		ebx,	eax				; skip the bytes reserved for this entry
+	psrldq	xmm0,	4
+%assign	x	x+4
+%endrep
+	jmp near assign_next_sse2
+
+hash_assign_with_copy_sse2:
+	movd	xmm1,	ebx
+	pshufd	xmm2,	xmm1,	0		; broadcast the running pointer to four dwords
+	movdqa	[edi+ecx], xmm2
+	movdqa	[ebp+ecx], xmm2
+
+assign_next_sse2:
+	add		ecx,	16
+	dec		edx
+	jnz		near hash_assign_loop_x4_sse2
+
+hash_assign_rem_begin_sse2:
+	mov		edx,	[esp+_ps+12]	; kiListSize
+	and		edx,	3				; entries left after the groups of four
+	jz		near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+	mov		[edi+ecx],	ebx
+	mov		[ebp+ecx],	ebx
+	mov		eax,	[esi+ecx]
+	sal		eax,	2
+	add		ebx,	eax
+	add		ecx,	4
+	dec		edx
+	jnz		near hash_assign_loop_x4_rem_sse2
+hash_assign_no_rem_sse2:
+	%undef	_ps
+	pop		ebp
+	pop		edi
+	pop		esi
+	pop		ebx
+	ret
 %else
 
 ;**********************************************************************************************************************
@@ -1221,6 +1384,146 @@
     POP_XMM
     LOAD_6_PARA_POP
     ret
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
+; Per block: stores dword (y<<16)|x at pFeatureValuePointerList[feature], then advances that pointer by 4 bytes.
+; kiWidth must be a multiple of 4; returns at once when kiWidth or kiHeight is <= 0.
+; r0 = pFeatureOfBlock cursor, r1 = kiWidth, r2 = kiHeight (scratch once copied to r12), r3 = pFeatureValuePointerList
+;-----------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN FillQpelLocationByFeatureValue_sse2
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r2, r2d
+    push r12
+    test    r1,     r1                  ; the loop counters below only stop at exactly zero,
+    jle     near FILL_QPEL_LOCATION_END_SSE2    ; so leave now when there is no column ...
+    mov     r12,    r2                  ; r12 = rows still to process
+    test    r12,    r12
+    jle     near FILL_QPEL_LOCATION_END_SSE2    ; ... or no row to process
+
+	movq	xmm7,	[mv_x_inc_x4]		; x_qpel inc
+	movq	xmm6,	[mv_y_inc_x4]		; y_qpel inc
+	movq	xmm5,	[mx_x_offset_x4]	; x_qpel vector at the start of a row
+	pxor	xmm4,	xmm4				; zero, used to widen words to dwords
+	pxor	xmm3,	xmm3				; y_qpel vector
+HASH_HEIGHT_LOOP_SSE2:
+	movdqa	xmm2,	xmm5				; restart x at the row's first group
+	mov		r4,	r1					; r4 = blocks left in this row
+HASH_WIDTH_LOOP_SSE2:
+	movq	xmm0,	[r0]				; four 16-bit feature values
+	punpcklwd	xmm0,	xmm4			; xmm0 = the four features as dwords
+	movdqa		xmm1,	xmm2
+	punpcklwd	xmm1,	xmm3			; xmm1 = four dwords (y<<16)|x
+%rep	3
+	movd	r2d,	xmm0        ; r2 = feature of the current block (upper half cleared by the 32-bit write)
+	lea		r5,     [r3+r2*8]   ; r5 = &pFeatureValuePointerList[feature]
+	mov		r6,     [r5]        ; r6 = current write position of that list
+	movd	[r6],	xmm1
+	prefetcht0	[r6+4]			; warm the next slot; unlike a plain load this cannot fault past the buffer end
+	lea		r6,     [r6+4]
+	mov		[r5],	r6			; publish the advanced write position
+	psrldq	xmm1,	4
+	psrldq	xmm0,	4
+%endrep
+	movd	r2d,	xmm0        ; fourth block of the group (no shift needed afterwards)
+	lea		r5,     [r3+r2*8]
+	mov		r6,     [r5]
+	movd	[r6],	xmm1
+	prefetcht0	[r6+4]			; same non-faulting hint as above
+	lea		r6,     [r6+4]
+	mov		[r5],	r6
+
+	paddw	xmm2,	xmm7				; x of the next four blocks
+	lea		r0,     [r0+8]
+	sub		r4,     4
+	jnz near HASH_WIDTH_LOOP_SSE2
+	paddw	xmm3,	xmm6				; y of the next row
+	dec	r12
+	jnz	near HASH_HEIGHT_LOOP_SSE2
+FILL_QPEL_LOCATION_END_SSE2:
+	pop		r12
+    POP_XMM
+    LOAD_4_PARA_POP
+	ret
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+;                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+; For i in [0, kiListSize): both tables get the running pointer (initially pBuf),
+; which then advances by 4*pTimesOfFeatureValue[i] bytes. Does nothing when kiListSize <= 0.
+; The x4 loop accesses all three arrays with movdqa, so they must be 16-byte aligned.
+; r0 = pTimesOfFeatureValue, r1 = running pointer, r2 = groups of four left, r3 = pLocationOfFeature,
+; r4 = pFeatureValuePointerList, r5 = byte offset into pTimesOfFeatureValue (the pointer tables use r5*2),
+; r6 = scratch, r12 = kiListSize kept for the remainder loop
+;---------------------------------------------------------------------------------------------------------------------------------------------------
+WELS_EXTERN InitializeHashforFeature_sse2
+    %assign  push_num 0
+    LOAD_5_PARA
+    SIGN_EXTENSION  r2, r2d
+    push r12
+
+    xor     r5,     r5
+    mov     r12,    r2
+    test    r2,     r2
+    jle     near hash_assign_no_rem_sse2        ; empty (or negative) list: nothing to assign
+	pxor	xmm3,	xmm3
+	sar		r2,     2                           ; r2 = number of complete groups of four
+    jz      near hash_assign_rem_begin_sse2     ; fewer than four entries: a zero count must not enter the dec/jnz loop
+
+hash_assign_loop_x4_sse2:
+	movdqa	xmm0,	[r0+r5]
+	pslld	xmm0,	2                           ; four byte lengths (count * 4)
+
+	movdqa	xmm1,	xmm0
+	pcmpeqd	xmm1,	xmm3
+	movmskps	r6,	xmm1
+    cmp         r6, 0x0f
+	jz	near hash_assign_with_copy_sse2         ; all four counts are zero: same pointer for all four entries
+
+%assign x	0
+%rep 4
+	mov		[r3+r5*2+x],	r1
+	mov		[r4+r5*2+x],	r1
+	movd	r6d,	xmm0
+	add		r1,     r6                          ; skip the bytes reserved for this entry
+	psrldq	xmm0,	4
+%assign	x	x+8
+%endrep
+	jmp near assign_next_sse2
+
+hash_assign_with_copy_sse2:
+	movq	xmm1,	r1
+	pshufd	xmm2,	xmm1,	01000100b           ; duplicate the 64-bit running pointer
+	movdqa	[r3+r5*2], xmm2
+	movdqa	[r4+r5*2], xmm2
+	movdqa	[r3+r5*2+16], xmm2
+	movdqa	[r4+r5*2+16], xmm2
+
+assign_next_sse2:
+	add		r5,	16
+	dec		r2
+	jnz		near hash_assign_loop_x4_sse2
+
+hash_assign_rem_begin_sse2:
+	and		r12,	3                           ; entries left after the groups of four
+	jz		near hash_assign_no_rem_sse2
+hash_assign_loop_x4_rem_sse2:
+	mov		[r3+r5*2],	r1
+	mov		[r4+r5*2],	r1
+	mov		r6d,	[r0+r5]
+	sal		r6,     2
+	add		r1,     r6
+	add		r5,     4
+	dec		r12
+	jnz		near hash_assign_loop_x4_rem_sse2
+
+hash_assign_no_rem_sse2:
+    pop		r12
+    LOAD_5_PARA_POP
+	ret
 
 %endif
 
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -6,6 +6,7 @@
 #include "cpu_core.h"
 #include "cpu.h"
 #include "macros.h"
+#include "ls_defines.h"
 #include "svc_motion_estimate.h"
 
 using namespace WelsEnc;
@@ -77,6 +78,33 @@
   }
 }
 
+// Reference model: entry i of both tables receives the running position in pBuf, which then moves on by twice the i-th count.
+void InitializeHashforFeature_ref (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                 uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) {
+  uint16_t* pSlice = pBuf;
+  for (int32_t iIdx = 0; iIdx < kiListSize; iIdx++) {
+    const uint32_t kuiSliceLen = pTimesOfFeatureValue[iIdx] << 1; // uint16_t elements skipped for this entry
+    pFeatureValuePointerList[iIdx] = pSlice;
+    pLocationOfFeature[iIdx] = pSlice;
+    pSlice += kuiSliceLen;
+  }
+}
+// Reference model: visits the feature map row by row; for the block at (iCol, iRow) it writes the
+// 32-bit value ((4 * iRow) << 16) | (4 * iCol) at the current position of that block's feature list
+// and moves the list position forward by two uint16_t.
+void FillQpelLocationByFeatureValue_ref (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                       uint16_t** pFeatureValuePointerList) {
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow) {
+    const uint16_t* pRowFeatures = pFeatureOfBlock + iRow * kiWidth;
+    const int32_t iQpelY = iRow << 2;
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      uint16_t*& pSlot = pFeatureValuePointerList[pRowFeatures[iCol]];
+      ST32 (&pSlot[0], ((iQpelY << 16) | (iCol << 2)));
+      pSlot += 2;
+    }
+  }
+}
+
 #define GENERATE_SumOfSingleBlock(anchor, method) \
 TEST (SVC_ME_FunTest, method) {\
   ENFORCE_STACK_ALIGN_1D (uint8_t,  uiRefBuf,   16*320, 16);\
@@ -135,6 +163,89 @@
 delete[] pFeatureOfBlockBuff1; \
 delete[] pFeatureOfBlockBuff2; \
 }
+// Builds a TEST that runs `anchor` and `method` on the same counts and the same buffer and expects identical pointer tables.
+#define GENERATE_InitializeHashforFeature(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { \
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
+  memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
+  memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
+  SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); \
+  int32_t iActSize = 65536; /* element count of every table allocated above */\
+  anchor ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0);\
+  method ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature1, pFeaturePointValueList1); \
+  for(int32_t j =0; j<65536; j++) { /* tables 0 come from anchor, tables 1 from method */ \
+    EXPECT_EQ (pLocationFeature0[j], pLocationFeature1[j]); \
+    EXPECT_EQ (pFeaturePointValueList0[j], pFeaturePointValueList1[j]); \
+  } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+
+// Builds a TEST that seeds two pointer lists with InitializeHashforFeature_c over separate buffers, fills one with `anchor` and one with `method`, and expects identical buffers.
+#define GENERATE_FillQpelLocationByFeatureValue(anchor, method, kiWidth, kiHeight) \
+TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
+ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock, pFeatureOfBlockBuff, (kiWidth*kiHeight), 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation1, pLocationBuff1, (kiWidth*kiHeight)*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t, pLocation2, pLocationBuff2, (kiWidth*kiHeight)*2, 16) \
+ENFORCE_NEW_ALIGN_1D (uint32_t, pTimesOfFeatureValue, pTimesOfFeatureValueBuff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature0, pLocationFeature0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pLocationFeature1, pLocationFeature1Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList0, pFeaturePointValueList0Buff, 65536, 16) \
+ENFORCE_NEW_ALIGN_1D (uint16_t*, pFeaturePointValueList1, pFeaturePointValueList1Buff, 65536, 16) \
+for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) { \
+  FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)); \
+  memset(pTimesOfFeatureValue, 0, 65536*sizeof(uint32_t)); \
+  memset(pLocationFeature0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList0, 0, 65536*sizeof(uint16_t*)); \
+  memset(pLocationFeature1, 0, 65536*sizeof(uint16_t*)); \
+  memset(pFeaturePointValueList1, 0, 65536*sizeof(uint16_t*)); \
+  SumOf8x8BlockOfFrame_c (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock,pTimesOfFeatureValue); \
+  int32_t iActSize = 65536; /* element count of every table allocated above */ \
+  InitializeHashforFeature_c ( pTimesOfFeatureValue, pLocation1, iActSize, pLocationFeature0, pFeaturePointValueList0); \
+  InitializeHashforFeature_c( pTimesOfFeatureValue, pLocation2, iActSize, pLocationFeature1, pFeaturePointValueList1); \
+  anchor(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList0); /* list 0 was set up over pLocation1 */ \
+  method(pFeatureOfBlock, kiWidth, kiHeight, pFeaturePointValueList1); /* list 1 was set up over pLocation2 */ \
+  for(int32_t j =0; j<kiWidth*kiHeight*2; j++) { \
+    EXPECT_EQ (pLocation1[j], pLocation2[j]); \
+  } \
+} \
+delete[] pRefPictureBuff; \
+delete[] pFeatureOfBlockBuff; \
+delete[] pLocationBuff1; \
+delete[] pLocationBuff2; \
+delete[] pTimesOfFeatureValueBuff; \
+delete[] pLocationFeature0Buff; \
+delete[] pFeaturePointValueList0Buff; \
+delete[] pLocationFeature1Buff; \
+delete[] pFeaturePointValueList1Buff; \
+}
+// Instantiations: the C routines are always compared with the references; the SSE2 routines only when X86_ASM is defined.
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_c, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_c, 16, 16)
+#ifdef X86_ASM
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_sse2, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_sse2, 16, 16)
+#endif
 
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)