ref: a00e2e722926c9f5ca7ea964a8258a888f520e03
parent: 7d3bb19bede84d39b11e880de88f79a746d365f5
author: Martin Storsjö <[email protected]>
date: Mon Apr 27 09:47:07 EDT 2015
Convert tabs to spaces in sample_sc.asm. This makes the file consistent with the rest of the assembly source files. Prior to f2314151e8, all the assembly files had consistent indentation, but after that commit, this file was indented differently.
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -37,9 +37,9 @@
SECTION .rodata align=16
ALIGN 16
-mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10
-mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04
-mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C
+mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10
+mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04
+mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C
SECTION .text
%ifdef X86_32
@@ -48,113 +48,113 @@
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;*********************************************************************************************************************
WELS_EXTERN SumOf8x8BlockOfFrame_sse2
-%define pushsize 16
-%define localsize 4
-%define ref esp + pushsize + localsize + 4
-%define sum_ref esp + pushsize + localsize + 20
-%define times_of_sum esp + pushsize + localsize + 24
-%define width esp + pushsize + localsize + 8
-%define height esp + pushsize + localsize + 12
-%define linesize esp + pushsize + localsize + 16
-%define tmp_width esp + 0
- push ebx
- push ebp
- push esi
- push edi
- sub esp, localsize
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp + 0
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize
- pxor xmm0, xmm0
- mov esi, [ref]
- mov edi, [sum_ref]
- mov edx, [times_of_sum]
- mov ebx, [linesize]
- mov eax, [width]
- lea ecx, [ebx+ebx*2] ; 3*linesize
+ pxor xmm0, xmm0
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov edx, [times_of_sum]
+ mov ebx, [linesize]
+ mov eax, [width]
+ lea ecx, [ebx+ebx*2] ; 3*linesize
- mov [tmp_width], eax
- lea ebp, [esi+ebx*4]
+ mov [tmp_width], eax
+ lea ebp, [esi+ebx*4]
FIRST_ROW:
- movq xmm1, [esi]
- movq xmm2, [esi+ebx]
- movq xmm3, [esi+ebx*2]
- movq xmm4, [esi+ecx]
+ movq xmm1, [esi]
+ movq xmm2, [esi+ebx]
+ movq xmm3, [esi+ebx*2]
+ movq xmm4, [esi+ecx]
- shufps xmm1, xmm2, 01000100b
- shufps xmm3, xmm4, 01000100b
- psadbw xmm1, xmm0
- psadbw xmm3, xmm0
- paddd xmm1, xmm3
+ shufps xmm1, xmm2, 01000100b
+ shufps xmm3, xmm4, 01000100b
+ psadbw xmm1, xmm0
+ psadbw xmm3, xmm0
+ paddd xmm1, xmm3
- movq xmm2, [ebp]
- movq xmm3, [ebp+ebx]
- movq xmm4, [ebp+ebx*2]
- movq xmm5, [ebp+ecx]
+ movq xmm2, [ebp]
+ movq xmm3, [ebp+ebx]
+ movq xmm4, [ebp+ebx*2]
+ movq xmm5, [ebp+ecx]
- shufps xmm2, xmm3, 01000100b
- shufps xmm4, xmm5, 01000100b
- psadbw xmm2, xmm0
- psadbw xmm4, xmm0
- paddd xmm2, xmm4
+ shufps xmm2, xmm3, 01000100b
+ shufps xmm4, xmm5, 01000100b
+ psadbw xmm2, xmm0
+ psadbw xmm4, xmm0
+ paddd xmm2, xmm4
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd eax, xmm1
- mov [edi], ax
- inc dword [edx+eax*4]
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd eax, xmm1
+ mov [edi], ax
+ inc dword [edx+eax*4]
- inc esi
- inc ebp
- add edi, 2
+ inc esi
+ inc ebp
+ add edi, 2
- dec dword [tmp_width]
- jg FIRST_ROW
+ dec dword [tmp_width]
+ jg FIRST_ROW
- mov esi, [ref]
- mov edi, [sum_ref]
- mov ebp, [width]
- dec dword [height]
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width]
+ dec dword [height]
HEIGHT_LOOP:
- mov [tmp_width], ebp
+ mov [tmp_width], ebp
WIDTH_LOOP:
- movq xmm1, [esi+ebx*8]
- movq xmm2, [esi]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psubd xmm1, xmm2
- movd eax, xmm1
- mov cx, [edi]
- add eax, ecx
+ movq xmm1, [esi+ebx*8]
+ movq xmm2, [esi]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubd xmm1, xmm2
+ movd eax, xmm1
+ mov cx, [edi]
+ add eax, ecx
- mov [edi+ebp*2], ax
- inc dword [edx+eax*4]
+ mov [edi+ebp*2], ax
+ inc dword [edx+eax*4]
- inc esi
- add edi, 2
+ inc esi
+ add edi, 2
- dec dword [tmp_width]
- jg WIDTH_LOOP
+ dec dword [tmp_width]
+ jg WIDTH_LOOP
- add esi, ebx
- sub esi, ebp
+ add esi, ebx
+ sub esi, ebp
- dec dword [height]
- jg HEIGHT_LOOP
+ dec dword [height]
+ jg HEIGHT_LOOP
- add esp, localsize
- pop edi
- pop esi
- pop ebp
- pop ebx
-%undef pushsize
-%undef localsize
-%undef ref
-%undef sum_ref
-%undef times_of_sum
-%undef width
-%undef height
-%undef linesize
-%undef tmp_width
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
ret
@@ -161,10 +161,10 @@
%macro COUNT_SUM 3
%define xmm_reg %1
%define tmp_reg %2
- movd tmp_reg, xmm_reg
- inc dword [edx+tmp_reg*4]
+ movd tmp_reg, xmm_reg
+ inc dword [edx+tmp_reg*4]
%if %3 == 1
- psrldq xmm_reg, 4
+ psrldq xmm_reg, 4
%endif
%endmacro
@@ -178,177 +178,177 @@
; read extra (16 - (width % 8) ) mod 16 bytes of every line
; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref
WELS_EXTERN SumOf8x8BlockOfFrame_sse4
-%define pushsize 16
-%define localsize 4
-%define ref esp + pushsize + localsize + 4
-%define sum_ref esp + pushsize + localsize + 20
-%define times_of_sum esp + pushsize + localsize + 24
-%define width esp + pushsize + localsize + 8
-%define height esp + pushsize + localsize + 12
-%define linesize esp + pushsize + localsize + 16
-%define tmp_width esp + 0
- push ebx
- push ebp
- push esi
- push edi
- sub esp, localsize
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp + 0
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize
- pxor xmm0, xmm0
- mov esi, [ref]
- mov edi, [sum_ref]
- mov edx, [times_of_sum]
- mov ebx, [linesize]
- mov eax, [width]
- lea ecx, [ebx+ebx*2] ; 3*linesize
+ pxor xmm0, xmm0
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov edx, [times_of_sum]
+ mov ebx, [linesize]
+ mov eax, [width]
+ lea ecx, [ebx+ebx*2] ; 3*linesize
- mov [tmp_width], eax
- lea ebp, [esi+ebx*4]
+ mov [tmp_width], eax
+ lea ebp, [esi+ebx*4]
FIRST_ROW_SSE4:
- movdqu xmm1, [esi]
- movdqu xmm3, [esi+ebx]
- movdqu xmm5, [esi+ebx*2]
- movdqu xmm7, [esi+ecx]
+ movdqu xmm1, [esi]
+ movdqu xmm3, [esi+ebx]
+ movdqu xmm5, [esi+ebx*2]
+ movdqu xmm7, [esi+ecx]
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 000b
- mpsadbw xmm2, xmm0, 100b
- paddw xmm1, xmm2 ; 8 sums of line1
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm1, xmm2 ; 8 sums of line1
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm3, xmm4 ; 8 sums of line2
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm3, xmm4 ; 8 sums of line2
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 000b
- mpsadbw xmm2, xmm0, 100b
- paddw xmm5, xmm2 ; 8 sums of line3
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm5, xmm2 ; 8 sums of line3
- movdqa xmm4, xmm7
- mpsadbw xmm7, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm7, xmm4 ; 8 sums of line4
+ movdqa xmm4, xmm7
+ mpsadbw xmm7, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm7, xmm4 ; 8 sums of line4
- paddw xmm1, xmm3
- paddw xmm5, xmm7
- paddw xmm1, xmm5 ; sum the upper 4 lines first
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+ paddw xmm1, xmm5 ; sum the upper 4 lines first
- movdqu xmm2, [ebp]
- movdqu xmm3, [ebp+ebx]
- movdqu xmm4, [ebp+ebx*2]
- movdqu xmm5, [ebp+ecx]
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
- movdqa xmm6, xmm2
- mpsadbw xmm2, xmm0, 000b
- mpsadbw xmm6, xmm0, 100b
- paddw xmm2, xmm6
+ movdqa xmm6, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm2, xmm6
- movdqa xmm7, xmm3
- mpsadbw xmm3, xmm0, 000b
- mpsadbw xmm7, xmm0, 100b
- paddw xmm3, xmm7
+ movdqa xmm7, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm3, xmm7
- movdqa xmm6, xmm4
- mpsadbw xmm4, xmm0, 000b
- mpsadbw xmm6, xmm0, 100b
- paddw xmm4, xmm6
+ movdqa xmm6, xmm4
+ mpsadbw xmm4, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm4, xmm6
- movdqa xmm7, xmm5
- mpsadbw xmm5, xmm0, 000b
- mpsadbw xmm7, xmm0, 100b
- paddw xmm5, xmm7
+ movdqa xmm7, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm1, xmm2
- paddw xmm1, xmm4 ; sum of lines 1- 8
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm1, xmm2
+ paddw xmm1, xmm4 ; sum of lines 1- 8
- movdqu [edi], xmm1
+ movdqu [edi], xmm1
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 0
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 0
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 0
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 0
- lea esi, [esi+8]
- lea ebp, [ebp+8]
- lea edi, [edi+16] ; element size is 2
+ lea esi, [esi+8]
+ lea ebp, [ebp+8]
+ lea edi, [edi+16] ; element size is 2
- sub dword [tmp_width], 8
- jg near FIRST_ROW_SSE4
+ sub dword [tmp_width], 8
+ jg near FIRST_ROW_SSE4
- mov esi, [ref]
- mov edi, [sum_ref]
- mov ebp, [width]
- dec dword [height]
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width]
+ dec dword [height]
HEIGHT_LOOP_SSE4:
- mov ecx, ebp
+ mov ecx, ebp
WIDTH_LOOP_SSE4:
- movdqu xmm1, [esi+ebx*8]
- movdqu xmm2, [esi]
- movdqu xmm7, [edi]
+ movdqu xmm1, [esi+ebx*8]
+ movdqu xmm2, [esi]
+ movdqu xmm7, [edi]
- movdqa xmm3, xmm1
- mpsadbw xmm1, xmm0, 000b
- mpsadbw xmm3, xmm0, 100b
- paddw xmm1, xmm3
+ movdqa xmm3, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm3, xmm0, 100b
+ paddw xmm1, xmm3
- movdqa xmm4, xmm2
- mpsadbw xmm2, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm2, xmm4
+ movdqa xmm4, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm2, xmm4
- paddw xmm7, xmm1
- psubw xmm7, xmm2
- movdqu [edi+ebp*2], xmm7
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqu [edi+ebp*2], xmm7
- movdqa xmm6, xmm7
- punpcklwd xmm7, xmm0
- punpckhwd xmm6, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm6, xmm0
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 0
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 0
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 0
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 0
- lea esi, [esi+8]
- lea edi, [edi+16]
+ lea esi, [esi+8]
+ lea edi, [edi+16]
- sub ecx, 8
- jg near WIDTH_LOOP_SSE4
+ sub ecx, 8
+ jg near WIDTH_LOOP_SSE4
- lea esi, [esi+ebx]
- sub esi, ebp
+ lea esi, [esi+ebx]
+ sub esi, ebp
- dec dword [height]
- jg near HEIGHT_LOOP_SSE4
+ dec dword [height]
+ jg near HEIGHT_LOOP_SSE4
- add esp, localsize
- pop edi
- pop esi
- pop ebp
- pop ebx
-%undef pushsize
-%undef localsize
-%undef ref
-%undef sum_ref
-%undef times_of_sum
-%undef width
-%undef height
-%undef linesize
-%undef tmp_width
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
ret
@@ -357,153 +357,153 @@
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;****************************************************************************************************************************************************
WELS_EXTERN SumOf16x16BlockOfFrame_sse2
-%define pushsize 16
-%define localsize 4
-%define ref esp + pushsize + localsize + 4
-%define sum_ref esp + pushsize + localsize + 20
-%define times_of_sum esp + pushsize + localsize + 24
-%define width esp + pushsize + localsize + 8
-%define height esp + pushsize + localsize + 12
-%define linesize esp + pushsize + localsize + 16
-%define tmp_width esp
- push ebx
- push ebp
- push esi
- push edi
- sub esp, localsize
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize
- pxor xmm0, xmm0
- mov esi, [ref]
- mov edi, [sum_ref]
- mov edx, [times_of_sum]
- mov ebx, [linesize]
- mov eax, [width]
+ pxor xmm0, xmm0
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov edx, [times_of_sum]
+ mov ebx, [linesize]
+ mov eax, [width]
- lea ecx, [ebx+ebx*2]
- mov [tmp_width], eax
+ lea ecx, [ebx+ebx*2]
+ mov [tmp_width], eax
FIRST_ROW_X16H:
- movdqu xmm1, [esi]
- movdqu xmm2, [esi+ebx]
- movdqu xmm3, [esi+ebx*2]
- movdqu xmm4, [esi+ecx]
+ movdqu xmm1, [esi]
+ movdqu xmm2, [esi+ebx]
+ movdqu xmm3, [esi+ebx*2]
+ movdqu xmm4, [esi+ecx]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm1, xmm3
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
- lea ebp, [esi+ebx*4]
- movdqu xmm2, [ebp]
- movdqu xmm3, [ebp+ebx]
- movdqu xmm4, [ebp+ebx*2]
- movdqu xmm5, [ebp+ecx]
+ lea ebp, [esi+ebx*4]
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
+ paddw xmm1, xmm2
- lea ebp, [ebp+ebx*4]
- movdqu xmm2, [ebp]
- movdqu xmm3, [ebp+ebx]
- movdqu xmm4, [ebp+ebx*2]
- movdqu xmm5, [ebp+ecx]
+ lea ebp, [ebp+ebx*4]
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
+ paddw xmm1, xmm2
- lea ebp, [ebp+ebx*4]
- movdqu xmm2, [ebp]
- movdqu xmm3, [ebp+ebx]
- movdqu xmm4, [ebp+ebx*2]
- movdqu xmm5, [ebp+ecx]
+ lea ebp, [ebp+ebx*4]
+ movdqu xmm2, [ebp]
+ movdqu xmm3, [ebp+ebx]
+ movdqu xmm4, [ebp+ebx*2]
+ movdqu xmm5, [ebp+ecx]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
- movdqa xmm2, xmm1
+ paddw xmm1, xmm2
+ movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
- movd eax, xmm1
- mov [edi], ax
- inc dword [edx+eax*4]
+ movd eax, xmm1
+ mov [edi], ax
+ inc dword [edx+eax*4]
- inc esi
- lea edi, [edi+2]
+ inc esi
+ lea edi, [edi+2]
- dec dword [tmp_width]
- jg near FIRST_ROW_X16H
+ dec dword [tmp_width]
+ jg near FIRST_ROW_X16H
- mov esi, [ref]
- mov edi, [sum_ref]
- mov ebp, [width]
- dec dword [height]
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width]
+ dec dword [height]
- mov ecx, ebx
- sal ecx, 4 ; succeeded 16th line
+ mov ecx, ebx
+ sal ecx, 4 ; succeeded 16th line
HEIGHT_LOOP_X16:
- mov [tmp_width], ebp
+ mov [tmp_width], ebp
WIDTH_LOOP_X16:
- movdqu xmm1, [esi+ecx]
- movdqu xmm2, [esi]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psubw xmm1, xmm2
- movdqa xmm2, xmm1
+ movdqu xmm1, [esi+ecx]
+ movdqu xmm2, [esi]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubw xmm1, xmm2
+ movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
- paddw xmm1, xmm2
- movd eax, xmm1
- add ax, word [edi]
- mov [edi+ebp*2], ax
- inc dword [edx+eax*4]
+ paddw xmm1, xmm2
+ movd eax, xmm1
+ add ax, word [edi]
+ mov [edi+ebp*2], ax
+ inc dword [edx+eax*4]
- inc esi
- add edi, 2
+ inc esi
+ add edi, 2
- dec dword [tmp_width]
- jg near WIDTH_LOOP_X16
+ dec dword [tmp_width]
+ jg near WIDTH_LOOP_X16
- add esi, ebx
- sub esi, ebp
+ add esi, ebx
+ sub esi, ebp
- dec dword [height]
- jg near HEIGHT_LOOP_X16
+ dec dword [height]
+ jg near HEIGHT_LOOP_X16
- add esp, localsize
- pop edi
- pop esi
- pop ebp
- pop ebx
-%undef pushsize
-%undef localsize
-%undef ref
-%undef sum_ref
-%undef times_of_sum
-%undef width
-%undef height
-%undef linesize
-%undef tmp_width
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
ret
; requires: width % 16 == 0 && height > 1
@@ -512,163 +512,163 @@
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------
; try 8 mv via offset
-%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
- movdqu %2, [%1]
- movdqu %3, [%1+8h]
- movdqa %4, %2
- movdqa %5, %3
+%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
+ movdqu %2, [%1]
+ movdqu %3, [%1+8h]
+ movdqa %4, %2
+ movdqa %5, %3
- mpsadbw %2, xmm0, 0 ; 000 B
- mpsadbw %4, xmm0, 5 ; 101 B
- mpsadbw %3, xmm0, 2 ; 010 B
- mpsadbw %5, xmm0, 7 ; 111 B
- paddw %2, %4
- paddw %3, %5
- paddw %2, %3 ; accumulate cost
-%endmacro ; end of SAD_16x16_LINE_SSE41
+ mpsadbw %2, xmm0, 0 ; 000 B
+ mpsadbw %4, xmm0, 5 ; 101 B
+ mpsadbw %3, xmm0, 2 ; 010 B
+ mpsadbw %5, xmm0, 7 ; 111 B
+ paddw %2, %4
+ paddw %3, %5
+ paddw %2, %3 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
-%define pushsize 16
-%define localsize 4
-%define ref esp + pushsize + localsize + 4
-%define sum_ref esp + pushsize + localsize + 20
-%define times_of_sum esp + pushsize + localsize + 24
-%define width esp + pushsize + localsize + 8
-%define height esp + pushsize + localsize + 12
-%define linesize esp + pushsize + localsize + 16
-%define tmp_width esp
- push ebx
- push ebp
- push esi
- push edi
- sub esp, localsize
+%define pushsize 16
+%define localsize 4
+%define ref esp + pushsize + localsize + 4
+%define sum_ref esp + pushsize + localsize + 20
+%define times_of_sum esp + pushsize + localsize + 24
+%define width esp + pushsize + localsize + 8
+%define height esp + pushsize + localsize + 12
+%define linesize esp + pushsize + localsize + 16
+%define tmp_width esp
+ push ebx
+ push ebp
+ push esi
+ push edi
+ sub esp, localsize
- pxor xmm0, xmm0
- mov esi, [ref]
- mov edi, [sum_ref]
- mov edx, [times_of_sum]
- mov ebx, [linesize]
- mov eax, [width]
+ pxor xmm0, xmm0
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov edx, [times_of_sum]
+ mov ebx, [linesize]
+ mov eax, [width]
- lea ecx, [ebx+ebx*2]
- mov [tmp_width], eax
+ lea ecx, [ebx+ebx*2]
+ mov [tmp_width], eax
FIRST_ROW_X16_SSE4:
- SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4
- SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5
- SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6
- SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm1, xmm3
+ SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5
+ SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6
+ SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
- lea ebp, [esi+ebx*4]
- SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea ebp, [esi+ebx*4]
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- lea ebp, [ebp+ebx*4]
- SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea ebp, [ebp+ebx*4]
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- lea ebp, [ebp+ebx*4]
- SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea ebp, [ebp+ebx*4]
+ SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- movdqa [edi], xmm1
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
+ movdqa [edi], xmm1
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 1
- COUNT_SUM xmm1, eax, 0
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 1
- COUNT_SUM xmm2, eax, 0
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 1
+ COUNT_SUM xmm1, eax, 0
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 1
+ COUNT_SUM xmm2, eax, 0
- lea esi, [esi+8]
- lea edi, [edi+16] ; element size is 2
+ lea esi, [esi+8]
+ lea edi, [edi+16] ; element size is 2
- sub dword [tmp_width], 8
- jg near FIRST_ROW_X16_SSE4
+ sub dword [tmp_width], 8
+ jg near FIRST_ROW_X16_SSE4
- mov esi, [ref]
- mov edi, [sum_ref]
- mov ebp, [width]
- dec dword [height]
+ mov esi, [ref]
+ mov edi, [sum_ref]
+ mov ebp, [width]
+ dec dword [height]
- mov ecx, ebx
- sal ecx, 4 ; succeeded 16th line
+ mov ecx, ebx
+ sal ecx, 4 ; succeeded 16th line
HEIGHT_LOOP_X16_SSE4:
- mov [tmp_width], ebp
+ mov [tmp_width], ebp
WIDTH_LOOP_X16_SSE4:
- movdqa xmm7, [edi]
- SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4
- SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5
+ movdqa xmm7, [edi]
+ SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5
- paddw xmm7, xmm1
- psubw xmm7, xmm2
- movdqa [edi+ebp*2], xmm7
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqa [edi+ebp*2], xmm7
- movdqa xmm6, xmm7
- punpcklwd xmm7, xmm0
- punpckhwd xmm6, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm6, xmm0
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 1
- COUNT_SUM xmm7, eax, 0
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 1
- COUNT_SUM xmm6, eax, 0
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 1
+ COUNT_SUM xmm7, eax, 0
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 1
+ COUNT_SUM xmm6, eax, 0
- lea esi, [esi+8]
- lea edi, [edi+16]
+ lea esi, [esi+8]
+ lea edi, [edi+16]
- sub dword [tmp_width], 8
- jg near WIDTH_LOOP_X16_SSE4
+ sub dword [tmp_width], 8
+ jg near WIDTH_LOOP_X16_SSE4
- add esi, ebx
- sub esi, ebp
+ add esi, ebx
+ sub esi, ebp
- dec dword [height]
- jg near HEIGHT_LOOP_X16_SSE4
+ dec dword [height]
+ jg near HEIGHT_LOOP_X16_SSE4
- add esp, localsize
- pop edi
- pop esi
- pop ebp
- pop ebx
-%undef pushsize
-%undef localsize
-%undef ref
-%undef sum_ref
-%undef times_of_sum
-%undef width
-%undef height
-%undef linesize
-%undef tmp_width
+ add esp, localsize
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+%undef pushsize
+%undef localsize
+%undef ref
+%undef sum_ref
+%undef times_of_sum
+%undef width
+%undef height
+%undef linesize
+%undef tmp_width
ret
@@ -676,78 +676,78 @@
; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
;-----------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN FillQpelLocationByFeatureValue_sse2
- push esi
- push edi
- push ebx
- push ebp
+ push esi
+ push edi
+ push ebx
+ push ebp
- %define _ps 16 ; push size
- %define _ls 4 ; local size
- %define sum_ref esp+_ps+_ls+4
- %define pos_list esp+_ps+_ls+16
- %define width esp+_ps+_ls+8
- %define height esp+_ps+_ls+12
- %define i_height esp
- sub esp, _ls
+ %define _ps 16 ; push size
+ %define _ls 4 ; local size
+ %define sum_ref esp+_ps+_ls+4
+ %define pos_list esp+_ps+_ls+16
+ %define width esp+_ps+_ls+8
+ %define height esp+_ps+_ls+12
+ %define i_height esp
+ sub esp, _ls
- mov esi, [sum_ref]
- mov edi, [pos_list]
- mov ebp, [width]
- mov ebx, [height]
- mov [i_height], ebx
+ mov esi, [sum_ref]
+ mov edi, [pos_list]
+ mov ebp, [width]
+ mov ebx, [height]
+ mov [i_height], ebx
- movq xmm7, [mv_x_inc_x4] ; x_qpel inc
- movq xmm6, [mv_y_inc_x4] ; y_qpel inc
- movq xmm5, [mx_x_offset_x4] ; x_qpel vector
- pxor xmm4, xmm4
- pxor xmm3, xmm3 ; y_qpel vector
+ movq xmm7, [mv_x_inc_x4] ; x_qpel inc
+ movq xmm6, [mv_y_inc_x4] ; y_qpel inc
+ movq xmm5, [mx_x_offset_x4] ; x_qpel vector
+ pxor xmm4, xmm4
+ pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
- movdqa xmm2, xmm5 ; x_qpel vector
- mov ecx, ebp
+ movdqa xmm2, xmm5 ; x_qpel vector
+ mov ecx, ebp
HASH_WIDTH_LOOP_SSE2:
- movq xmm0, [esi] ; load x8 sum
- punpcklwd xmm0, xmm4
- movdqa xmm1, xmm2
- punpcklwd xmm1, xmm3
-%rep 3
- movd edx, xmm0
- lea ebx, [edi+edx*4]
- mov eax, [ebx]
- movd [eax], xmm1
- mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
- lea eax, [eax+4]
- mov [ebx], eax
- psrldq xmm1, 4
- psrldq xmm0, 4
+ movq xmm0, [esi] ; load x8 sum
+ punpcklwd xmm0, xmm4
+ movdqa xmm1, xmm2
+ punpcklwd xmm1, xmm3
+%rep 3
+ movd edx, xmm0
+ lea ebx, [edi+edx*4]
+ mov eax, [ebx]
+ movd [eax], xmm1
+ mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
+ lea eax, [eax+4]
+ mov [ebx], eax
+ psrldq xmm1, 4
+ psrldq xmm0, 4
%endrep
- movd edx, xmm0
- lea ebx, [edi+edx*4]
- mov eax, [ebx]
- movd [eax], xmm1
- mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
- lea eax, [eax+4]
- mov [ebx], eax
+ movd edx, xmm0
+ lea ebx, [edi+edx*4]
+ mov eax, [ebx]
+ movd [eax], xmm1
+ mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation
+ lea eax, [eax+4]
+ mov [ebx], eax
- paddw xmm2, xmm7
- lea esi, [esi+8]
- sub ecx, 4
+ paddw xmm2, xmm7
+ lea esi, [esi+8]
+ sub ecx, 4
jnz near HASH_WIDTH_LOOP_SSE2
- paddw xmm3, xmm6
- dec dword [i_height]
- jnz near HASH_HEIGHT_LOOP_SSE2
+ paddw xmm3, xmm6
+ dec dword [i_height]
+ jnz near HASH_HEIGHT_LOOP_SSE2
- add esp, _ls
- %undef _ps
- %undef _ls
- %undef sum_ref
- %undef pos_list
- %undef width
- %undef height
- %undef i_height
- pop ebp
- pop ebx
- pop edi
- pop esi
+ add esp, _ls
+ %undef _ps
+ %undef _ls
+ %undef sum_ref
+ %undef pos_list
+ %undef width
+ %undef height
+ %undef i_height
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
ret
;---------------------------------------------------------------------------------------------------------------------------------------------------
@@ -755,74 +755,74 @@
; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
;---------------------------------------------------------------------------------------------------------------------------------------------------
WELS_EXTERN InitializeHashforFeature_sse2
- push ebx
- push esi
- push edi
- push ebp
- %define _ps 16 ; push size
- mov edi, [esp+_ps+16] ; pPositionOfSum
- mov ebp, [esp+_ps+20] ; sum_idx_list
- mov esi, [esp+_ps+4] ; pTimesOfSum
- mov ebx, [esp+_ps+8] ; pBuf
- mov edx, [esp+_ps+12] ; list_sz
- sar edx, 2
- mov ecx, 0
- pxor xmm7, xmm7
+ push ebx
+ push esi
+ push edi
+ push ebp
+ %define _ps 16 ; push size
+ mov edi, [esp+_ps+16] ; pPositionOfSum
+ mov ebp, [esp+_ps+20] ; sum_idx_list
+ mov esi, [esp+_ps+4] ; pTimesOfSum
+ mov ebx, [esp+_ps+8] ; pBuf
+ mov edx, [esp+_ps+12] ; list_sz
+ sar edx, 2
+ mov ecx, 0
+ pxor xmm7, xmm7
hash_assign_loop_x4_sse2:
- movdqa xmm0, [esi+ecx]
- pslld xmm0, 2
+ movdqa xmm0, [esi+ecx]
+ pslld xmm0, 2
- movdqa xmm1, xmm0
- pcmpeqd xmm1, xmm7
- movmskps eax, xmm1
+ movdqa xmm1, xmm0
+ pcmpeqd xmm1, xmm7
+ movmskps eax, xmm1
cmp eax, 0x0f
- je near hash_assign_with_copy_sse2
+ je near hash_assign_with_copy_sse2
-%assign x 0
+%assign x 0
%rep 4
- lea eax, [edi+ecx+x]
- mov [eax], ebx
- lea eax, [ebp+ecx+x]
- mov [eax], ebx
- movd eax, xmm0
- add ebx, eax
- psrldq xmm0, 4
-%assign x x+4
+ lea eax, [edi+ecx+x]
+ mov [eax], ebx
+ lea eax, [ebp+ecx+x]
+ mov [eax], ebx
+ movd eax, xmm0
+ add ebx, eax
+ psrldq xmm0, 4
+%assign x x+4
%endrep
jmp near assign_next_sse2
hash_assign_with_copy_sse2:
- movd xmm1, ebx
- pshufd xmm2, xmm1, 0
- movdqa [edi+ecx], xmm2
- movdqa [ebp+ecx], xmm2
+ movd xmm1, ebx
+ pshufd xmm2, xmm1, 0
+ movdqa [edi+ecx], xmm2
+ movdqa [ebp+ecx], xmm2
assign_next_sse2:
- add ecx, 16
- dec edx
- jnz near hash_assign_loop_x4_sse2
+ add ecx, 16
+ dec edx
+ jnz near hash_assign_loop_x4_sse2
- mov edx, [esp+_ps+12] ; list_sz
- and edx, 3
- jz near hash_assign_no_rem_sse2
+ mov edx, [esp+_ps+12] ; list_sz
+ and edx, 3
+ jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
- lea eax, [edi+ecx]
- mov [eax], ebx
- lea eax, [ebp+ecx]
- mov [eax], ebx
- mov eax, [esi+ecx]
- sal eax, 2
- add ebx, eax
- add ecx, 4
- dec edx
- jnz near hash_assign_loop_x4_rem_sse2
+ lea eax, [edi+ecx]
+ mov [eax], ebx
+ lea eax, [ebp+ecx]
+ mov [eax], ebx
+ mov eax, [esi+ecx]
+ sal eax, 2
+ add ebx, eax
+ add ecx, 4
+ dec edx
+ jnz near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
- %undef _ps
- pop ebp
- pop edi
- pop esi
- pop ebx
+ %undef _ps
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
ret
%else
@@ -843,47 +843,47 @@
push r2
push r4
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
lea r6, [r3+r3*2]
- mov r12, r1 ;r12:tmp_width
- lea r13, [r0+r3*4] ;rbp:r13
+ mov r12, r1 ;r12:tmp_width
+ lea r13, [r0+r3*4] ;rbp:r13
FIRST_ROW:
- movq xmm1, [r0]
- movq xmm2, [r0+r3]
- movq xmm3, [r0+r3*2]
- movq xmm4, [r0+r6]
+ movq xmm1, [r0]
+ movq xmm2, [r0+r3]
+ movq xmm3, [r0+r3*2]
+ movq xmm4, [r0+r6]
- shufps xmm1, xmm2, 01000100b
- shufps xmm3, xmm4, 01000100b
- psadbw xmm1, xmm0
- psadbw xmm3, xmm0
- paddd xmm1, xmm3
+ shufps xmm1, xmm2, 01000100b
+ shufps xmm3, xmm4, 01000100b
+ psadbw xmm1, xmm0
+ psadbw xmm3, xmm0
+ paddd xmm1, xmm3
- movq xmm2, [r13]
- movq xmm3, [r13+r3]
- movq xmm4, [r13+r3*2]
- movq xmm5, [r13+r6]
+ movq xmm2, [r13]
+ movq xmm3, [r13+r3]
+ movq xmm4, [r13+r3*2]
+ movq xmm5, [r13+r6]
- shufps xmm2, xmm3, 01000100b
- shufps xmm4, xmm5, 01000100b
- psadbw xmm2, xmm0
- psadbw xmm4, xmm0
- paddd xmm2, xmm4
+ shufps xmm2, xmm3, 01000100b
+ shufps xmm4, xmm5, 01000100b
+ psadbw xmm2, xmm0
+ psadbw xmm4, xmm0
+ paddd xmm2, xmm4
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd r2d, xmm1
- mov [r4], r2w
- inc dword [r5+r2*4]
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd r2d, xmm1
+ mov [r4], r2w
+ inc dword [r5+r2*4]
- inc r0
- inc r13
- add r4, 2
+ inc r0
+ inc r13
+ add r4, 2
- dec r12
- jg FIRST_ROW
+ dec r12
+ jg FIRST_ROW
pop r4
pop r2
@@ -891,34 +891,34 @@
mov r13, r2
dec r13
HEIGHT_LOOP:
- mov r12, r1
+ mov r12, r1
WIDTH_LOOP:
- movq xmm1, [r0+r3*8]
- movq xmm2, [r0]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psubd xmm1, xmm2
- movd r2d, xmm1
- mov r6w, [r4]
- add r2d, r6d
- mov [r4+r1*2], r2w
- inc dword [r5+r2*4]
+ movq xmm1, [r0+r3*8]
+ movq xmm2, [r0]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubd xmm1, xmm2
+ movd r2d, xmm1
+ mov r6w, [r4]
+ add r2d, r6d
+ mov [r4+r1*2], r2w
+ inc dword [r5+r2*4]
- inc r0
- add r4, 2
+ inc r0
+ add r4, 2
- dec r12
- jg WIDTH_LOOP
+ dec r12
+ jg WIDTH_LOOP
- add r0, r3
- sub r0, r1
+ add r0, r3
+ sub r0, r1
- dec r13
- jg HEIGHT_LOOP
+ dec r13
+ jg HEIGHT_LOOP
- pop r13
- pop r12
+ pop r13
+ pop r12
POP_XMM
LOAD_6_PARA_POP
ret
@@ -928,10 +928,10 @@
%define xmm_reg %1
%define tmp_dreg %2
%define tmp_qreg %3
- movd tmp_dreg, xmm_reg
- inc dword [r5+tmp_qreg*4]
+ movd tmp_dreg, xmm_reg
+ inc dword [r5+tmp_qreg*4]
%if %4 == 1
- psrldq xmm_reg, 4
+ psrldq xmm_reg, 4
%endif
%endmacro
@@ -957,92 +957,92 @@
push r2
push r4
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
lea r6, [r3+r3*2]
- mov r12, r1 ;r12:tmp_width
- lea r13, [r0+r3*4] ;rbp:r13
+ mov r12, r1 ;r12:tmp_width
+ lea r13, [r0+r3*4] ;rbp:r13
FIRST_ROW_SSE4:
- movdqu xmm1, [r0]
- movdqu xmm3, [r0+r3]
- movdqu xmm5, [r0+r3*2]
- movdqu xmm7, [r0+r6]
+ movdqu xmm1, [r0]
+ movdqu xmm3, [r0+r3]
+ movdqu xmm5, [r0+r3*2]
+ movdqu xmm7, [r0+r6]
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 000b
- mpsadbw xmm2, xmm0, 100b
- paddw xmm1, xmm2 ; 8 sums of line1
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm1, xmm2 ; 8 sums of line1
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm3, xmm4 ; 8 sums of line2
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm3, xmm4 ; 8 sums of line2
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 000b
- mpsadbw xmm2, xmm0, 100b
- paddw xmm5, xmm2 ; 8 sums of line3
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm2, xmm0, 100b
+ paddw xmm5, xmm2 ; 8 sums of line3
- movdqa xmm4, xmm7
- mpsadbw xmm7, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm7, xmm4 ; 8 sums of line4
+ movdqa xmm4, xmm7
+ mpsadbw xmm7, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm7, xmm4 ; 8 sums of line4
- paddw xmm1, xmm3
- paddw xmm5, xmm7
- paddw xmm1, xmm5 ; sum the upper 4 lines first
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+ paddw xmm1, xmm5 ; sum the upper 4 lines first
- movdqu xmm2, [r13]
- movdqu xmm3, [r13+r3]
- movdqu xmm4, [r13+r3*2]
- movdqu xmm5, [r13+r6]
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
- movdqa xmm6, xmm2
- mpsadbw xmm2, xmm0, 000b
- mpsadbw xmm6, xmm0, 100b
- paddw xmm2, xmm6
+ movdqa xmm6, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm2, xmm6
- movdqa xmm7, xmm3
- mpsadbw xmm3, xmm0, 000b
- mpsadbw xmm7, xmm0, 100b
- paddw xmm3, xmm7
+ movdqa xmm7, xmm3
+ mpsadbw xmm3, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm3, xmm7
- movdqa xmm6, xmm4
- mpsadbw xmm4, xmm0, 000b
- mpsadbw xmm6, xmm0, 100b
- paddw xmm4, xmm6
+ movdqa xmm6, xmm4
+ mpsadbw xmm4, xmm0, 000b
+ mpsadbw xmm6, xmm0, 100b
+ paddw xmm4, xmm6
- movdqa xmm7, xmm5
- mpsadbw xmm5, xmm0, 000b
- mpsadbw xmm7, xmm0, 100b
- paddw xmm5, xmm7
+ movdqa xmm7, xmm5
+ mpsadbw xmm5, xmm0, 000b
+ mpsadbw xmm7, xmm0, 100b
+ paddw xmm5, xmm7
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm1, xmm2
- paddw xmm1, xmm4 ; sum of lines 1- 8
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm1, xmm2
+ paddw xmm1, xmm4 ; sum of lines 1- 8
- movdqu [r4], xmm1
+ movdqu [r4], xmm1
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 0
- COUNT_SUM xmm2, r2d, r2 ,1
- COUNT_SUM xmm2, r2d, r2 ,1
- COUNT_SUM xmm2, r2d, r2 ,1
- COUNT_SUM xmm2, r2d, r2 ,0
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 0
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,1
+ COUNT_SUM xmm2, r2d, r2 ,0
- lea r0, [r0+8]
- lea r13, [r13+8]
- lea r4, [r4+16] ; element size is 2
+ lea r0, [r0+8]
+ lea r13, [r13+8]
+ lea r4, [r4+16] ; element size is 2
- sub r12, 8
- jg near FIRST_ROW_SSE4
+ sub r12, 8
+ jg near FIRST_ROW_SSE4
pop r4
pop r2
@@ -1050,53 +1050,53 @@
mov r13, r2
dec r13
HEIGHT_LOOP_SSE4:
- mov r12, r1
+ mov r12, r1
WIDTH_LOOP_SSE4:
- movdqu xmm1, [r0+r3*8]
- movdqu xmm2, [r0]
- movdqu xmm7, [r4]
+ movdqu xmm1, [r0+r3*8]
+ movdqu xmm2, [r0]
+ movdqu xmm7, [r4]
- movdqa xmm3, xmm1
- mpsadbw xmm1, xmm0, 000b
- mpsadbw xmm3, xmm0, 100b
- paddw xmm1, xmm3
+ movdqa xmm3, xmm1
+ mpsadbw xmm1, xmm0, 000b
+ mpsadbw xmm3, xmm0, 100b
+ paddw xmm1, xmm3
- movdqa xmm4, xmm2
- mpsadbw xmm2, xmm0, 000b
- mpsadbw xmm4, xmm0, 100b
- paddw xmm2, xmm4
+ movdqa xmm4, xmm2
+ mpsadbw xmm2, xmm0, 000b
+ mpsadbw xmm4, xmm0, 100b
+ paddw xmm2, xmm4
- paddw xmm7, xmm1
- psubw xmm7, xmm2
- movdqu [r4+r1*2], xmm7
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqu [r4+r1*2], xmm7
- movdqa xmm6, xmm7
- punpcklwd xmm7, xmm0
- punpckhwd xmm6, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm6, xmm0
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 0
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 0
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 0
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 0
- lea r0, [r0+8]
- lea r4, [r4+16]
+ lea r0, [r0+8]
+ lea r4, [r4+16]
- sub r12, 8
- jg near WIDTH_LOOP_SSE4
+ sub r12, 8
+ jg near WIDTH_LOOP_SSE4
- lea r0, [r0+r3]
- sub r0, r1
+ lea r0, [r0+r3]
+ sub r0, r1
- dec r13
- jg near HEIGHT_LOOP_SSE4
+ dec r13
+ jg near HEIGHT_LOOP_SSE4
- pop r13
- pop r12
+ pop r13
+ pop r12
POP_XMM
LOAD_6_PARA_POP
ret
@@ -1119,83 +1119,83 @@
push r2
push r4
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
lea r6, [r3+r3*2]
- mov r12, r1 ;r12:tmp_width
+ mov r12, r1 ;r12:tmp_width
FIRST_ROW_X16H:
- movdqu xmm1, [r0]
- movdqu xmm2, [r0+r3]
- movdqu xmm3, [r0+r3*2]
- movdqu xmm4, [r0+r6]
+ movdqu xmm1, [r0]
+ movdqu xmm2, [r0+r3]
+ movdqu xmm3, [r0+r3*2]
+ movdqu xmm4, [r0+r6]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm1, xmm3
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
- lea r13, [r0+r3*4] ;ebp:r13
- movdqu xmm2, [r13]
- movdqu xmm3, [r13+r3]
- movdqu xmm4, [r13+r3*2]
- movdqu xmm5, [r13+r6]
+ lea r13, [r0+r3*4] ;ebp:r13
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
+ paddw xmm1, xmm2
- lea r13, [r13+r3*4]
- movdqu xmm2, [r13]
- movdqu xmm3, [r13+r3]
- movdqu xmm4, [r13+r3*2]
- movdqu xmm5, [r13+r6]
+ lea r13, [r13+r3*4]
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
+ paddw xmm1, xmm2
- lea r13, [r13+r3*4]
- movdqu xmm2, [r13]
- movdqu xmm3, [r13+r3]
- movdqu xmm4, [r13+r3*2]
- movdqu xmm5, [r13+r6]
+ lea r13, [r13+r3*4]
+ movdqu xmm2, [r13]
+ movdqu xmm3, [r13+r3]
+ movdqu xmm4, [r13+r3*2]
+ movdqu xmm5, [r13+r6]
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- paddw xmm2, xmm4
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ paddw xmm2, xmm4
- paddw xmm1, xmm2
- movdqa xmm2, xmm1
+ paddw xmm1, xmm2
+ movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
paddw xmm1, xmm2
- movd r2d, xmm1
- mov [r4], r2w
- inc dword [r5+r2*4]
+ movd r2d, xmm1
+ mov [r4], r2w
+ inc dword [r5+r2*4]
- inc r0
- lea r4, [r4+2]
+ inc r0
+ lea r4, [r4+2]
- dec r12
- jg near FIRST_ROW_X16H
+ dec r12
+ jg near FIRST_ROW_X16H
pop r4
pop r2
@@ -1202,38 +1202,38 @@
pop r0
mov r13, r2
dec r13
- mov r6, r3
- sal r6, 4 ; succeeded 16th line
+ mov r6, r3
+ sal r6, 4 ; succeeded 16th line
HEIGHT_LOOP_X16:
- mov r12, r1
+ mov r12, r1
WIDTH_LOOP_X16:
- movdqu xmm1, [r0+r6]
- movdqu xmm2, [r0]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psubw xmm1, xmm2
- movdqa xmm2, xmm1
+ movdqu xmm1, [r0+r6]
+ movdqu xmm2, [r0]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psubw xmm1, xmm2
+ movdqa xmm2, xmm1
punpckhwd xmm2, xmm0
- paddw xmm1, xmm2
- movd r2d, xmm1
- add r2w, word [r4]
- mov [r4+r1*2], r2w
- inc dword [r5+r2*4]
+ paddw xmm1, xmm2
+ movd r2d, xmm1
+ add r2w, word [r4]
+ mov [r4+r1*2], r2w
+ inc dword [r5+r2*4]
- inc r0
- add r4, 2
+ inc r0
+ add r4, 2
- dec r12
- jg near WIDTH_LOOP_X16
+ dec r12
+ jg near WIDTH_LOOP_X16
- add r0, r3
- sub r0, r1
+ add r0, r3
+ sub r0, r1
- dec r13
- jg near HEIGHT_LOOP_X16
+ dec r13
+ jg near HEIGHT_LOOP_X16
- pop r13
- pop r12
+ pop r13
+ pop r12
POP_XMM
LOAD_6_PARA_POP
ret
@@ -1244,20 +1244,20 @@
; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
;-----------------------------------------------------------------------------------------------------------------------------
; try 8 mv via offset
-%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
- movdqu %2, [%1]
- movdqu %3, [%1+8h]
- movdqa %4, %2
- movdqa %5, %3
+%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1
+ movdqu %2, [%1]
+ movdqu %3, [%1+8h]
+ movdqa %4, %2
+ movdqa %5, %3
- mpsadbw %2, xmm0, 0 ; 000 B
- mpsadbw %4, xmm0, 5 ; 101 B
- mpsadbw %3, xmm0, 2 ; 010 B
- mpsadbw %5, xmm0, 7 ; 111 B
- paddw %2, %4
- paddw %3, %5
- paddw %2, %3 ; accumulate cost
-%endmacro ; end of SAD_16x16_LINE_SSE41
+ mpsadbw %2, xmm0, 0 ; 000 B
+ mpsadbw %4, xmm0, 5 ; 101 B
+ mpsadbw %3, xmm0, 2 ; 010 B
+ mpsadbw %5, xmm0, 7 ; 111 B
+ paddw %2, %4
+ paddw %3, %5
+ paddw %2, %3 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41
WELS_EXTERN SumOf16x16BlockOfFrame_sse4
%assign push_num 0
@@ -1272,68 +1272,68 @@
push r2
push r4
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
lea r6, [r3+r3*2]
- mov r12, r1 ;r12:tmp_width
+ mov r12, r1 ;r12:tmp_width
FIRST_ROW_X16_SSE4:
- SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4
- SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5
- SUM_LINE_X16_SSE41 r0+r3*2,xmm3, xmm4, xmm5, xmm6
- SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm1, xmm3
+ SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5
+ SUM_LINE_X16_SSE41 r0+r3*2,xmm3, xmm4, xmm5, xmm6
+ SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm1, xmm3
- lea r13, [r0+r3*4]
- SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea r13, [r0+r3*4]
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- lea r13, [r13+r3*4]
- SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea r13, [r13+r3*4]
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- lea r13, [r13+r3*4]
- SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
- SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
- paddw xmm1, xmm2
+ lea r13, [r13+r3*4]
+ SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
+ SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5
+ paddw xmm1, xmm2
- movdqa [r4], xmm1
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm0
- punpckhwd xmm2, xmm0
+ movdqa [r4], xmm1
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 1
- COUNT_SUM xmm1, r2d, r2, 0
- COUNT_SUM xmm2, r2d, r2, 1
- COUNT_SUM xmm2, r2d, r2, 1
- COUNT_SUM xmm2, r2d, r2, 1
- COUNT_SUM xmm2, r2d, r2, 0
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 1
+ COUNT_SUM xmm1, r2d, r2, 0
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 1
+ COUNT_SUM xmm2, r2d, r2, 0
- lea r0, [r0+8]
- lea r4, [r4+16] ; element size is 2
+ lea r0, [r0+8]
+ lea r4, [r4+16] ; element size is 2
- sub r12, 8
- jg near FIRST_ROW_X16_SSE4
+ sub r12, 8
+ jg near FIRST_ROW_X16_SSE4
pop r4
pop r2
@@ -1340,47 +1340,47 @@
pop r0
mov r13, r2
dec r13
- mov r6, r3
- sal r6, 4 ; succeeded 16th line
+ mov r6, r3
+ sal r6, 4 ; succeeded 16th line
HEIGHT_LOOP_X16_SSE4:
- mov r12, r1
+ mov r12, r1
WIDTH_LOOP_X16_SSE4:
- movdqa xmm7, [r4]
- SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
- SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5
+ movdqa xmm7, [r4]
+ SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4
+ SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5
- paddw xmm7, xmm1
- psubw xmm7, xmm2
- movdqa [r4+r1*2], xmm7
+ paddw xmm7, xmm1
+ psubw xmm7, xmm2
+ movdqa [r4+r1*2], xmm7
- movdqa xmm6, xmm7
- punpcklwd xmm7, xmm0
- punpckhwd xmm6, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm6, xmm0
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 1
- COUNT_SUM xmm7, r2d, r2, 0
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 1
- COUNT_SUM xmm6, r2d, r2, 0
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 1
+ COUNT_SUM xmm7, r2d, r2, 0
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 1
+ COUNT_SUM xmm6, r2d, r2, 0
- lea r0, [r0+8]
- lea r4, [r4+16]
+ lea r0, [r0+8]
+ lea r4, [r4+16]
- sub r12, 8
- jg near WIDTH_LOOP_X16_SSE4
+ sub r12, 8
+ jg near WIDTH_LOOP_X16_SSE4
- add r0, r3
- sub r0, r1
+ add r0, r3
+ sub r0, r1
- dec r13
- jg near HEIGHT_LOOP_X16_SSE4
+ dec r13
+ jg near HEIGHT_LOOP_X16_SSE4
- pop r13
- pop r12
+ pop r13
+ pop r12
POP_XMM
LOAD_6_PARA_POP
ret
@@ -1398,48 +1398,48 @@
push r13
mov r12, r2
- movq xmm7, [mv_x_inc_x4] ; x_qpel inc
- movq xmm6, [mv_y_inc_x4] ; y_qpel inc
- movq xmm5, [mx_x_offset_x4] ; x_qpel vector
- pxor xmm4, xmm4
- pxor xmm3, xmm3 ; y_qpel vector
+ movq xmm7, [mv_x_inc_x4] ; x_qpel inc
+ movq xmm6, [mv_y_inc_x4] ; y_qpel inc
+ movq xmm5, [mx_x_offset_x4] ; x_qpel vector
+ pxor xmm4, xmm4
+ pxor xmm3, xmm3 ; y_qpel vector
HASH_HEIGHT_LOOP_SSE2:
- movdqa xmm2, xmm5 ; x_qpel vector
- mov r4, r1
+ movdqa xmm2, xmm5 ; x_qpel vector
+ mov r4, r1
HASH_WIDTH_LOOP_SSE2:
- movq xmm0, [r0] ; load x8 sum
- punpcklwd xmm0, xmm4
- movdqa xmm1, xmm2
- punpcklwd xmm1, xmm3
-%rep 3
- movd r2d, xmm0 ;edx:r3
- lea r5, [r3+r2*8] ;ebx:r5
- mov r6, [r5] ;eax:r6
- movd [r6], xmm1
- mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
- lea r6, [r6+4]
- mov [r5], r6
- psrldq xmm1, 4
- psrldq xmm0, 4
+ movq xmm0, [r0] ; load x8 sum
+ punpcklwd xmm0, xmm4
+ movdqa xmm1, xmm2
+ punpcklwd xmm1, xmm3
+%rep 3
+ movd r2d, xmm0 ;edx:r3
+ lea r5, [r3+r2*8] ;ebx:r5
+ mov r6, [r5] ;eax:r6
+ movd [r6], xmm1
+ mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
+ lea r6, [r6+4]
+ mov [r5], r6
+ psrldq xmm1, 4
+ psrldq xmm0, 4
%endrep
- movd r2d, xmm0
- lea r5, [r3+r2*8] ;ebx:r5
- mov r6, [r5] ;eax:r6
- movd [r6], xmm1
- mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
- lea r6, [r6+4]
- mov [r5], r6
+ movd r2d, xmm0
+ lea r5, [r3+r2*8] ;ebx:r5
+ mov r6, [r5] ;eax:r6
+ movd [r6], xmm1
+ mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation
+ lea r6, [r6+4]
+ mov [r5], r6
- paddw xmm2, xmm7
- lea r0, [r0+8]
- sub r4, 4
+ paddw xmm2, xmm7
+ lea r0, [r0+8]
+ sub r4, 4
jnz near HASH_WIDTH_LOOP_SSE2
- paddw xmm3, xmm6
- dec r12
- jnz near HASH_HEIGHT_LOOP_SSE2
+ paddw xmm3, xmm6
+ dec r12
+ jnz near HASH_HEIGHT_LOOP_SSE2
- pop r13
- pop r12
+ pop r13
+ pop r12
POP_XMM
ret
@@ -1455,69 +1455,69 @@
push r12
push r13
mov r12, r2
- sar r2, 2
- mov r5, 0 ;r5:ecx
+ sar r2, 2
+ mov r5, 0 ;r5:ecx
xor r6, r6
- pxor xmm3, xmm3
+ pxor xmm3, xmm3
hash_assign_loop_x4_sse2:
- movdqa xmm0, [r0+r5]
- pslld xmm0, 2
+ movdqa xmm0, [r0+r5]
+ pslld xmm0, 2
- movdqa xmm1, xmm0
- pcmpeqd xmm1, xmm3
- movmskps r6, xmm1
+ movdqa xmm1, xmm0
+ pcmpeqd xmm1, xmm3
+ movmskps r6, xmm1
cmp r6, 0x0f
- jz near hash_assign_with_copy_sse2
+ jz near hash_assign_with_copy_sse2
-%assign x 0
+%assign x 0
%rep 4
- lea r13, [r3+r5*2+x]
- mov [r13], r1
- lea r13, [r4+r5*2+x]
- mov [r13], r1
- movd r6d, xmm0
- add r1, r6
- psrldq xmm0, 4
-%assign x x+8
+ lea r13, [r3+r5*2+x]
+ mov [r13], r1
+ lea r13, [r4+r5*2+x]
+ mov [r13], r1
+ movd r6d, xmm0
+ add r1, r6
+ psrldq xmm0, 4
+%assign x x+8
%endrep
jmp near assign_next_sse2
hash_assign_with_copy_sse2:
- movq xmm1, r1
- pshufd xmm2, xmm1, 01000100b
- movdqa [r3+r5*2], xmm2
- movdqa [r4+r5*2], xmm2
- movdqa [r3+r5*2+16], xmm2
- movdqa [r4+r5*2+16], xmm2
+ movq xmm1, r1
+ pshufd xmm2, xmm1, 01000100b
+ movdqa [r3+r5*2], xmm2
+ movdqa [r4+r5*2], xmm2
+ movdqa [r3+r5*2+16], xmm2
+ movdqa [r4+r5*2+16], xmm2
assign_next_sse2:
- add r5, 16
- dec r2
- jnz near hash_assign_loop_x4_sse2
+ add r5, 16
+ dec r2
+ jnz near hash_assign_loop_x4_sse2
- and r12, 3
- jz near hash_assign_no_rem_sse2
+ and r12, 3
+ jz near hash_assign_no_rem_sse2
hash_assign_loop_x4_rem_sse2:
- lea r13, [r3+r5*2]
- mov [r13], r1
- lea r13, [r4+r5*2]
- mov [r13], r1
- mov r6d, [r0+r5]
- sal r6, 2
- add r1, r6
- add r5, 4
- dec r12
- jnz near hash_assign_loop_x4_rem_sse2
+ lea r13, [r3+r5*2]
+ mov [r13], r1
+ lea r13, [r4+r5*2]
+ mov [r13], r1
+ mov r6d, [r0+r5]
+ sal r6, 2
+ add r1, r6
+ add r5, 4
+ dec r12
+ jnz near hash_assign_loop_x4_rem_sse2
hash_assign_no_rem_sse2:
pop r13
- pop r12
+ pop r12
ret
%endif
;**********************************************************************************************************************************
-; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;**********************************************************************************************************************************
WELS_EXTERN SumOf8x8SingleBlock_sse2
%assign push_num 0
@@ -1553,7 +1553,7 @@
ret
;**********************************************************************************************************************************
-; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
+; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize)
;**********************************************************************************************************************************
WELS_EXTERN SumOf16x16SingleBlock_sse2
%assign push_num 0