ref: 57fc3e991792ea277a309bcc9351bb800d46b380
parent: eb9f56584fae81eab9be6ab999040ed5e4a7cfcd
author: Sindre Aamås <[email protected]>
date: Fri Apr 8 13:05:38 EDT 2016
[Processing] Add AVX2 VAA routines

Process 8 lines at a time rather than 16 lines at a time because this appears to give more reliable memory subsystem performance on Haswell.

Speedup is > 2x as compared to SSE2 when not memory-bound on Haswell. On my Haswell MBP, VAACalcSadSsdBgd is about 3x faster when uncached, which appears to be related to processing 8 lines at a time rather than 16. The other routines are also faster than their SSE2 counterparts in this case, but to a lesser extent.
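For reference, a minimal scalar sketch of what the SAD variant computes follows (reference-only, not part of the patch). The _ref name and the 8x8 block ordering within each 16x16 macroblock (upper-left, upper-right, lower-left, lower-right) are assumptions based on the existing C/SSE2 routines.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch of the per-8x8-block SAD plus frame SAD that the AVX2
     * routine vectorizes.  Block ordering inside each macroblock is assumed. */
    static void VAACalcSad_ref (const uint8_t* pCur, const uint8_t* pRef,
                                int32_t iPicWidth, int32_t iPicHeight,
                                int32_t iPicStride,
                                int32_t* pFrameSad, int32_t* pSad8x8) {
      int32_t iFrameSad = 0;
      int32_t iMbIdx    = 0;
      for (int32_t y = 0; y < (iPicHeight & -16); y += 16) {
        for (int32_t x = 0; x < (iPicWidth & -16); x += 16, ++iMbIdx) {
          for (int32_t b = 0; b < 4; ++b) {  /* 4 8x8 blocks per macroblock */
            const int32_t iOff = (y + (b >> 1) * 8) * iPicStride + x + (b & 1) * 8;
            int32_t iSad = 0;
            for (int32_t i = 0; i < 8; ++i)
              for (int32_t j = 0; j < 8; ++j)
                iSad += abs (pCur[iOff + i * iPicStride + j] -
                             pRef[iOff + i * iPicStride + j]);
            pSad8x8[4 * iMbIdx + b] = iSad;
            iFrameSad += iSad;
          }
        }
      }
      *pFrameSad = iFrameSad;
    }

The AVX2 routines produce the same outputs but, as noted above, traverse the picture in bands of 8 lines across the full width rather than macroblock by macroblock.
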
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -64,6 +64,13 @@
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
}
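+  // When AVX2 is available, the assignments below override the SSE2 routines selected above.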
+ if (iCpuFlag & WELS_CPU_AVX2) {
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_avx2;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_avx2;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_avx2;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_avx2;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_avx2;
+ }
#endif//X86_ASM
#ifdef HAVE_NEON
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -104,6 +104,11 @@
VAACalcSadFunc VAACalcSad_sse2;
VAACalcSadVarFunc VAACalcSadVar_sse2;
VAACalcSadSsdFunc VAACalcSadSsd_sse2;
+VAACalcSadBgdFunc VAACalcSadBgd_avx2;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_avx2;
+VAACalcSadFunc VAACalcSad_avx2;
+VAACalcSadVarFunc VAACalcSadVar_avx2;
+VAACalcSadSsdFunc VAACalcSadSsd_avx2;
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2028,3 +2028,1532 @@
%undef localsize
ret
%endif
+
+%ifdef X86_32
+%define ptrword dword
+%else
+%define ptrword qword
+%endif
+
+%define xmm_width 16
+%define ymm_width 32
+
+%macro PUSHM 1-*
+ %rep %0
+ push %1
+ %rotate 1
+ %endrep
+ %assign push_num push_num + %0
+%endmacro
+
+%macro POPM 1-*
+ %rep %0
+ %rotate -1
+ pop %1
+ %endrep
+ %assign push_num push_num - %0
+%endmacro
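+
+; Example (matches the usage further down): save and restore a set of
+; callee-saved GPRs while keeping push_num in sync.
+;     PUSHM rbx, rbp, r12
+;     ...
+;     POPM  rbx, rbp, r12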
+
+%ifdef X86_32
+%define stack_alloc_min 4
+%else
+%define stack_alloc_min 8
+%endif
+
+; Allocate aligned stack space.
+; address_out=%1 size=%2 alignment=%3
+%macro STACK_ALLOC 3
+%if (%3) & ((%3) - 1)
+ %error non-power-of-2 alignment requested.
+%endif
+%if (%3) > 0
+ %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min
+%else
+ %assign stack_alloc_align 1
+%endif
+ %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1
+ %assign push_num push_num + stack_alloc_num
+ sub r7, stack_alloc_min * stack_alloc_num
+%if stack_alloc_align == 1
+ mov %1, r7
+%else
+ lea %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)]
+ and %1, -(stack_alloc_min * stack_alloc_align)
+%endif
+%endmacro
+
+; Deallocate stack space allocated with STACK_ALLOC.
+%macro STACK_DEALLOC 0
+ add r7, stack_alloc_min * stack_alloc_num
+ %assign push_num push_num - stack_alloc_num
+%endmacro
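+
+; Example (as used by the 32-bit paths below): reserve a 32-byte-aligned,
+; ymm-sized spill slot and release it before returning.
+;     STACK_ALLOC r5, ymm_width, ymm_width
+;     vmovdqa [r5], ymm0
+;     ...
+;     STACK_DEALLOC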
+
+; Max unsigned byte per quadword
+; out=%1 in=%2 tmp=%3
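+; The low byte of each quadword of the output receives the maximum of that
+; quadword's 8 input bytes; the remaining bytes are don't-care partial maxima.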
+%macro AVX2_Maxubq 3
+ vpsrlq %3, %2, 32
+ vpmaxub %1, %2, %3
+ vpsrlq %3, %1, 16
+ vpmaxub %1, %1, %3
+ vpsrlq %3, %1, 8
+ vpmaxub %1, %1, %3
+%endmacro
+
+; Max unsigned byte per quadword. Two-register input.
+; Results interleaved as least significant byte of even/odd doublewords.
+; out=%1 in_a=%2 in_b=%3 tmp=%4
+%macro AVX2_Maxubq2 4
+ vpblendd %4, %2, %3, 10101010b
+ vpshufd %4, %4, 10110001b
+ vpblendd %1, %2, %3, 01010101b
+ vpmaxub %1, %4, %1
+ vpsrld %4, %1, 16
+ vpmaxub %1, %1, %4
+ vpsrld %4, %1, 8
+ vpmaxub %1, %1, %4
+%endmacro
+
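+; Sum of squares of unsigned bytes, accumulated as packed doublewords:
+; bytes are zero-extended to words and each word vector is multiplied with
+; itself via vpmaddwd, so a horizontal doubleword reduction is still needed
+; afterwards to obtain the final sqsum.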
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sqsumbdw 5
+ vpunpcklbw %4, %2, %3
+%if %5
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%else
+ vpmaddwd %1, %4, %4
+%endif
+ vpunpckhbw %4, %2, %3
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%endmacro
+
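+; Sum of unsigned bytes per quadword: vpsadbw against zero leaves the sum of
+; each quadword's 8 bytes in the low word of that quadword.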
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sumbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
+; Absolute difference of unsigned bytes: res = |a - b|, computed as (a -sat b) | (b -sat a).
+; res=%1 a=%2 b=%3 a=%4 (second reference to a; needed because tmp may alias %2) tmp=%5
+%macro AVX2_AbsDiffub 5
+ vpsubusb %5, %2, %3
+ vpsubusb %1, %3, %4
+ vpor %1, %5, %1
+%endmacro
+
+; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sadbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8
+%macro AVX2_SadSumSqsumbdw 8
+ AVX2_Sadbdw %1, %4, %5, %7, %8
+ AVX2_Sumbdw %2, %4, %6, %7, %8
+ AVX2_Sqsumbdw %3, %4, %6, %7, %8
+%endmacro
+
+; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sad 5
+ vmovdqu %4, [%2]
+ AVX2_Sadbdw %1, %4, [%3], %4, %5
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9
+%macro AVX2_SadSumSqsum 9
+ vmovdqu %7, [%4]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSumSqsumSqdiff 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ AVX2_Sqsumbdw %4, %9, %7, %10, %11
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSdMad 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_Sumbdw %2, %8, %7, %10, %11
+ AVX2_Sumbdw %3, %9, %7, %10, %11
+ AVX2_Sadbdw %1, %8, %9, %10, %11
+%if %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ vpmaxub %4, %4, %9
+%else
+ AVX2_AbsDiffub %4, %8, %9, %8, %10
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13
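+; If %12 is 0, only two temporaries are available (the 32-bit build passes a
+; literal 0 for the third temp), so the current block is reloaded from memory
+; instead of being held in a third register.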
+%macro AVX2_SadBgdSqdiff 13
+%ifidn %12, 0
+ vmovdqu %10, [%7]
+ AVX2_Sumbdw %2, %10, %9, %11, %13
+ AVX2_Sqsumbdw %6, %10, %9, %11, %13
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %10, %13
+ AVX2_Sumbdw %3, %11, %9, %10, %13
+ vmovdqu %10, [%7]
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, [%7], %10
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, [%7], %10
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%else
+ vmovdqu %10, [%7]
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %12, %13
+ AVX2_Sumbdw %2, %10, %9, %12, %13
+ AVX2_Sumbdw %3, %11, %9, %12, %13
+ AVX2_Sqsumbdw %6, %10, %9, %12, %13
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, %10, %12
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, %10, %12
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%endif
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accdw 5
+ vpshufd %2%4, %2%3, 1000b
+%ifidni %2, x
+ vmovlps [%1 + 8 * %5], x%4
+%elif %5 == 0
+ vmovdqu [%1], %2%4
+%else
+ vmovlps [%1 + 8], x%4
+ vextracti128 x%4, %2%4, 1
+ vmovlps [%1 + 24], x%4
+%endif
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accb 5
+ vpunpckhqdq %2%4, %2%3, %2%3
+ vpunpcklbw %2%4, %2%3, %2%4
+%if %5 == 0
+ vmovd [%1 + 0], x%4
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vmovd [%1 + 4], x%4
+%endif
+%else
+ vpextrw [%1 + 2], x%4, 0
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vpextrw [%1 + 6], x%4, 0
+%endif
+%endif
+%endmacro
+
+; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5
+%macro AVX2_Store2x8x8Accb 5
+ vpunpckhqdq y%3, y%2, y%2
+ vpunpcklbw y%3, y%2, y%3
+ vextracti128 x%4, y%3, 1
+ vpsllq x%4, x%4, 32
+ vpblendd x%4, x%3, x%4, 1010b
+%if %5
+ vpslld x%4, x%4, 16
+ vpblendw x%4, x%4, [%1], 01010101b
+%endif
+ vmovdqu [%1], x%4
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5
+%macro AVX2_Store16x16Accdw 5
+%ifidni %2, x
+%if %5
+ vmovd x%4, [%1 + 0]
+ vpaddd x%3, x%4, x%3
+%endif
+ vmovd [%1 + 0], x%3
+%elif %5 == 0
+ vmovd [%1 + 0], x%3
+ vextracti128 x%3, %2%3, 1
+ vmovd [%1 + 4], x%3
+%else
+ vextracti128 x%4, %2%3, 1
+ vpunpckldq x%4, x%3, x%4
+ vmovq x%3, [%1 + 0]
+ vpaddd x%3, x%3, x%4
+ vmovlps [%1 + 0], x%3
+%endif
+%endmacro
+
+; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8
+%macro AVX2_Store2x16x16Accdw 8
+%ifidni %5, x
+ mov %4, %1
+%if %8 == 0
+ vmovd [%4 + %3], x%6
+ mov %4, %2
+ vpextrd [%4 + %3], x%6, 2
+%else
+ vmovd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovd [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vpextrd [%4 + %3], x%7, 2
+%endif
+%else
+ vextracti128 x%7, %5%6, 1
+ vpblendd x%6, x%6, x%7, 1010b
+ mov %4, %1
+%if %8 == 0
+ vmovlps [%4 + %3], x%6
+ mov %4, %2
+ vmovhps [%4 + %3], x%6
+%else
+ vmovq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovlps [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovhps [%4 + %3], x%7
+%endif
+%endif
+%endmacro
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSad_8Lines 7
+%define mm_tmp0 %2
+%define mm_sad %3
+%define mm_sad2 %4
+%define mm_sad3 %5
+%define mm_sad4 %6
+%define b_second_blocks %7
+%ifdef i_stride5
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ ; Use multiple accumulators to shorten dependency chains and enable more parallelism.
+ AVX2_Sad %1 %+ mm_sad, p_cur, p_ref, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1
+%ifdef i_stride7
+ %define i_stride7_ i_stride7
+%else
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride7_ r_tmp
+%endif
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1
+%undef i_stride5_
+%undef i_stride7_
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+ ; Collapse accumulators.
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad2
+ vpaddd %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad3
+ AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+%undef mm_tmp0
+%undef mm_sad
+%undef mm_sad2
+%undef mm_sad3
+%undef mm_sad4
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSad_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSad_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r5, p_sad8x8
+ %define i_stride3 r3
+ %undef p_sad8x8
+ %define p_sad8x8 r5
+ %define r_tmp r6
+ lea i_stride3, [3 * i_stride]
+%else
+ mov rbp, p_sad8x8
+ %define i_stride3 rbx
+ %define i_stride5 r12
+ %define i_stride7 r6
+ %undef p_sad8x8
+ %define p_sad8x8 rbp
+ lea i_stride3, [3 * i_stride]
+ lea i_stride5, [5 * i_stride]
+ lea i_stride7, [i_stride + 2 * i_stride3]
+%endif
+
+ ; offset pointer so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+
+.height_loop:
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt]
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ neg i_xcnt
+ sub i_ycnt, 1
+ jnz .height_loop
+
+ pop i_xcnt
+%assign push_num push_num - 1
+%undef i_xcnt_load
+
+.done:
+ mov r6, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r6], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef r_tmp
+%undef xcnt_unit
+%undef i_stride3
+%undef i_stride5
+%undef i_stride7
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSadVar_8Lines 7
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sqsum %6
+%define b_second_blocks %7
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadVar_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadVar_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r3, p_sad8x8
+ %undef p_sad8x8
+ %define p_sad8x8 r3
+ %define i_stride3 r5
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadSsd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %4
+%define mm_sad %5
+%define mm_sum %6
+%define mm_sqsum %7
+%define mm_sqdiff %8
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+%ifdef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+%ifndef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+%endif
+%undef i_stride5_
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsd_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%define p_sqdiff16x16 ptrword arg10
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 9
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov r12, p_sad8x8
+ mov r13, p_sum16x16
+ mov r14, p_sqsum16x16
+ mov r15, p_sqdiff16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %define p_sad8x8 r12
+ %define p_sum16x16 r13
+ %define p_sqsum16x16 r14
+ %define p_sqdiff16x16 r15
+ %define i_stride3 rbx
+ %define i_stride5 rbp
+ lea i_stride5, [5 * i_stride]
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+ sub p_sqdiff16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ sub p_sqdiff16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef i_stride5
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadBgd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %8
+%define mm_mad %4
+%define mm_sumcur %5
+%define mm_sumref %6
+%define mm_sad %7
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_mad
+%undef mm_sumcur
+%undef mm_sumref
+%undef mm_sad
+%undef b_second_blocks
+%endmacro
+
+; Store the remaining MAD accumulator for the case where the width has a 32-pixel remainder (width & 32 != 0).
+; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4
+%macro AVX2_StoreRemainingSingleMad 4
+ test %1, 32 / xcnt_unit
+ jz %%skip
+ mov r_tmp, p_mad8x8
+ vmovdqa y%2, prev_mad
+ AVX2_Maxubq y%2, y%2, y%3
+ AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4
+%%skip:
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
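+; Per 8x8 block, p_sd8x8 receives the difference between the byte sums of the
+; current and reference blocks (sum(cur) - sum(ref)), and p_mad8x8 receives
+; the maximum absolute per-pixel difference.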
+
+WELS_EXTERN VAACalcSadBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sd8x8 arg8
+%define p_mad8x8 arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 10
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, 2 * ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+ %define prev_mad ymm9
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sd8x8
+ mov r13, p_mad8x8
+ %undef p_sad8x8
+ %undef p_sd8x8
+ %undef p_mad8x8
+ %define p_sad8x8 rbp
+ %define p_sd8x8 r12
+ %define p_mad8x8 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11
+%macro AVX2_CalcSadSsdBgd_8Lines 11
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sumref %6
+%define mm_mad %7
+%define mm_sqsum %8
+%define mm_sqdiff %9
+%ifidn %10, 0
+%define tmp2 0
+%else
+%define tmp2 %1 %+ %10
+%endif
+%define b_second_blocks %11
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_sum, %1 %+ mm_sum, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef mm_mad
+%undef mm_sum
+%undef mm_sumref
+%undef mm_sad
+%undef tmp2
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sum16x16 arg8
+%define p_sqsum16x16 arg9
+%define p_sqdiff16x16 arg10
+%define p_sd8x8 arg11
+%define p_mad8x8 arg12
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 12
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%ifdef X86_32
+ STACK_ALLOC r5, 3 * ymm_width, ymm_width
+ %define mm8 0
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+ %define ymm_zero [r5 + 2 * ymm_width]
+ %define xmm_zero ymm_zero
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa sadframe_acc, ymm0
+ vmovdqa ymm_zero, ymm0
+%else
+ %define sadframe_acc ymm9
+ %define xsadframe_acc xmm9
+ %define prev_mad ymm10
+ %define ymm_zero ymm11
+ %define xmm_zero xmm11
+ vpxor xmm_zero, xmm_zero, xmm_zero
+ vpxor xsadframe_acc, xsadframe_acc, xsadframe_acc
+%endif
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ mov r14, p_sqdiff16x16
+ mov r15, p_sd8x8
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %undef p_sd8x8
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define p_sqdiff16x16 r14
+ %define p_sd8x8 r15
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_sum16x16, r_tmp
+ sub p_sqsum16x16, r_tmp
+ sub p_sqdiff16x16, r_tmp
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sum16x16, i_xcnt
+ add p_sqsum16x16, i_xcnt
+ add p_sqdiff16x16, i_xcnt
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm8
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef ymm_zero
+%undef xmm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -828,6 +828,12 @@
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)
+
+GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)