ref: 56618249d7ba939399679b6c5fd0363b520528e6
parent: 98c6c6de119de1b082df4cb0373d9eaac886a6f6
parent: 93db6511a84a56950390eec7d71f9e3255dd4c75
author: ruil2 <[email protected]>
date: Thu Apr 28 05:08:03 EDT 2016
Merge pull request #2436 from saamas/processing-add-avx2-vaa-routines

[Processing] Add AVX2 VAA routines
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -64,6 +64,13 @@
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
}
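+ // AVX2 versions, when available, override the SSE2 assignments above.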
+ if (iCpuFlag & WELS_CPU_AVX2) {
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_avx2;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_avx2;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_avx2;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_avx2;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_avx2;
+ }
#endif//X86_ASM
#ifdef HAVE_NEON
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -104,6 +104,11 @@
VAACalcSadFunc VAACalcSad_sse2;
VAACalcSadVarFunc VAACalcSadVar_sse2;
VAACalcSadSsdFunc VAACalcSadSsd_sse2;
+VAACalcSadBgdFunc VAACalcSadBgd_avx2;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_avx2;
+VAACalcSadFunc VAACalcSad_avx2;
+VAACalcSadVarFunc VAACalcSadVar_avx2;
+VAACalcSadSsdFunc VAACalcSadSsd_avx2;
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2028,3 +2028,1532 @@
%undef localsize
ret
%endif
+
+%ifdef X86_32
+%define ptrword dword
+%else
+%define ptrword qword
+%endif
+
+%define xmm_width 16
+%define ymm_width 32
+
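+; Push/pop a list of registers while keeping the push_num stack-depth count in sync.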
+%macro PUSHM 1-*
+ %rep %0
+ push %1
+ %rotate 1
+ %endrep
+ %assign push_num push_num + %0
+%endmacro
+
+%macro POPM 1-*
+ %rep %0
+ %rotate -1
+ pop %1
+ %endrep
+ %assign push_num push_num - %0
+%endmacro
+
+%ifdef X86_32
+%define stack_alloc_min 4
+%else
+%define stack_alloc_min 8
+%endif
+
+; Allocate aligned stack space.
+; address_out=%1 size=%2 alignment=%3
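+; Over-allocates enough slack for the requested alignment and masks the returned
+; address down to an aligned boundary.
+; e.g. STACK_ALLOC r5, 2 * ymm_width, ymm_width returns a 32-byte-aligned 64-byte block in r5.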
+%macro STACK_ALLOC 3
+%if (%3) & ((%3) - 1)
+ %error non-power-of-2 alignment requested.
+%endif
+%if (%3) > 0
+ %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min
+%else
+ %assign stack_alloc_align 1
+%endif
+ %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1
+ %assign push_num push_num + stack_alloc_num
+ sub r7, stack_alloc_min * stack_alloc_num
+%if stack_alloc_align == 1
+ mov %1, r7
+%else
+ lea %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)]
+ and %1, -(stack_alloc_min * stack_alloc_align)
+%endif
+%endmacro
+
+; Deallocate stack space allocated with STACK_ALLOC.
+%macro STACK_DEALLOC 0
+ add r7, stack_alloc_min * stack_alloc_num
+ %assign push_num push_num - stack_alloc_num
+%endmacro
+
+; Max unsigned byte per quadword
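+; Leaves the maximum of the 8 bytes of each quadword in that quadword's low byte.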
+; out=%1 in=%2 tmp=%3
+%macro AVX2_Maxubq 3
+ vpsrlq %3, %2, 32
+ vpmaxub %1, %2, %3
+ vpsrlq %3, %1, 16
+ vpmaxub %1, %1, %3
+ vpsrlq %3, %1, 8
+ vpmaxub %1, %1, %3
+%endmacro
+
+; Max unsigned byte per quadword. 2 register input.
+; Results interleaved as least significant byte of even/odd doublewords.
+; out=%1 in_a=%2 in_b=%3 tmp=%4
+%macro AVX2_Maxubq2 4
+ vpblendd %4, %2, %3, 10101010b
+ vpshufd %4, %4, 10110001b
+ vpblendd %1, %2, %3, 01010101b
+ vpmaxub %1, %4, %1
+ vpsrld %4, %1, 16
+ vpmaxub %1, %1, %4
+ vpsrld %4, %1, 8
+ vpmaxub %1, %1, %4
+%endmacro
+
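+; Sum of squared bytes: zero-extend bytes to words by unpacking with zero, then
+; vpmaddwd squares and pairwise-adds them into doubleword sums.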
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sqsumbdw 5
+ vpunpcklbw %4, %2, %3
+%if %5
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%else
+ vpmaddwd %1, %4, %4
+%endif
+ vpunpckhbw %4, %2, %3
+ vpmaddwd %4, %4, %4
+ vpaddd %1, %1, %4
+%endmacro
+
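+; Byte sum via vpsadbw against zero; each quadword lane receives the sum of its 8 bytes.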
+; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5
+%macro AVX2_Sumbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
+; Absolute difference of packed unsigned bytes via saturating subtraction in both directions.
+; res=%1 a=%2 b=%3 a=%4 tmp=%5 (a is passed twice; the second copy may be a memory operand)
+%macro AVX2_AbsDiffub 5
+ vpsubusb %5, %2, %3
+ vpsubusb %1, %3, %4
+ vpor %1, %5, %1
+%endmacro
+
+; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sadbdw 5
+%if %5
+ vpsadbw %4, %2, %3
+ vpaddd %1, %1, %4
+%else
+ vpsadbw %1, %2, %3
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8
+%macro AVX2_SadSumSqsumbdw 8
+ AVX2_Sadbdw %1, %4, %5, %7, %8
+ AVX2_Sumbdw %2, %4, %6, %7, %8
+ AVX2_Sqsumbdw %3, %4, %6, %7, %8
+%endmacro
+
+; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5
+%macro AVX2_Sad 5
+ vmovdqu %4, [%2]
+ AVX2_Sadbdw %1, %4, [%3], %4, %5
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9
+%macro AVX2_SadSumSqsum 9
+ vmovdqu %7, [%4]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9
+%endmacro
+
+; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSumSqsumSqdiff 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ AVX2_Sqsumbdw %4, %9, %7, %10, %11
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11
+%macro AVX2_SadSdMad 11
+ vmovdqu %8, [%5]
+ vmovdqu %9, [%6]
+ AVX2_Sumbdw %2, %8, %7, %10, %11
+ AVX2_Sumbdw %3, %9, %7, %10, %11
+ AVX2_Sadbdw %1, %8, %9, %10, %11
+%if %11
+ AVX2_AbsDiffub %9, %8, %9, %8, %10
+ vpmaxub %4, %4, %9
+%else
+ AVX2_AbsDiffub %4, %8, %9, %8, %10
+%endif
+%endmacro
+
+; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13
+%macro AVX2_SadBgdSqdiff 13
+%ifidn %12, 0
+ vmovdqu %10, [%7]
+ AVX2_Sumbdw %2, %10, %9, %11, %13
+ AVX2_Sqsumbdw %6, %10, %9, %11, %13
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %10, %13
+ AVX2_Sumbdw %3, %11, %9, %10, %13
+ vmovdqu %10, [%7]
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, [%7], %10
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, [%7], %10
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%else
+ vmovdqu %10, [%7]
+ vmovdqu %11, [%8]
+ AVX2_Sadbdw %1, %10, %11, %12, %13
+ AVX2_Sumbdw %2, %10, %9, %12, %13
+ AVX2_Sumbdw %3, %11, %9, %12, %13
+ AVX2_Sqsumbdw %6, %10, %9, %12, %13
+%if %13
+ AVX2_AbsDiffub %11, %10, %11, %10, %12
+ vpmaxub %4, %4, %11
+ AVX2_Sqsumbdw %5, %11, %9, %10, %13
+%else
+ AVX2_AbsDiffub %4, %10, %11, %10, %12
+ AVX2_Sqsumbdw %5, %4, %9, %10, %13
+%endif
+%endif
+%endmacro
+
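+; Packs the per-lane vpsadbw results (dwords 0 and 2 of each lane) into adjacent
+; doublewords and stores them as per-8x8-block results; second_blocks selects the
+; lower-8x8 slots.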
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accdw 5
+ vpshufd %2%4, %2%3, 1000b
+%ifidni %2, x
+ vmovlps [%1 + 8 * %5], x%4
+%elif %5 == 0
+ vmovdqu [%1], %2%4
+%else
+ vmovlps [%1 + 8], x%4
+ vextracti128 x%4, %2%4, 1
+ vmovlps [%1 + 24], x%4
+%endif
+%endmacro
+
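+; Packs the low byte of each quadword (the per-8x8 MAD values) into adjacent bytes
+; and stores them; second_blocks selects the lower-8x8 byte slots.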
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5
+%macro AVX2_Store8x8Accb 5
+ vpunpckhqdq %2%4, %2%3, %2%3
+ vpunpcklbw %2%4, %2%3, %2%4
+%if %5 == 0
+ vmovd [%1 + 0], x%4
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vmovd [%1 + 4], x%4
+%endif
+%else
+ vpextrw [%1 + 2], x%4, 0
+%ifidni %2, y
+ vextracti128 x%4, %2%4, 1
+ vpextrw [%1 + 6], x%4, 0
+%endif
+%endif
+%endmacro
+
+; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5
+%macro AVX2_Store2x8x8Accb 5
+ vpunpckhqdq y%3, y%2, y%2
+ vpunpcklbw y%3, y%2, y%3
+ vextracti128 x%4, y%3, 1
+ vpsllq x%4, x%4, 32
+ vpblendd x%4, x%3, x%4, 1010b
+%if %5
+ vpslld x%4, x%4, 16
+ vpblendw x%4, x%4, [%1], 01010101b
+%endif
+ vmovdqu [%1], x%4
+%endmacro
+
+; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5
+%macro AVX2_Store16x16Accdw 5
+%ifidni %2, x
+%if %5
+ vmovd x%4, [%1 + 0]
+ vpaddd x%3, x%4, x%3
+%endif
+ vmovd [%1 + 0], x%3
+%elif %5 == 0
+ vmovd [%1 + 0], x%3
+ vextracti128 x%3, %2%3, 1
+ vmovd [%1 + 4], x%3
+%else
+ vextracti128 x%4, %2%3, 1
+ vpunpckldq x%4, x%3, x%4
+ vmovq x%3, [%1 + 0]
+ vpaddd x%3, x%3, x%4
+ vmovlps [%1 + 0], x%3
+%endif
+%endmacro
+
+; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8
+%macro AVX2_Store2x16x16Accdw 8
+%ifidni %5, x
+ mov %4, %1
+%if %8 == 0
+ vmovd [%4 + %3], x%6
+ mov %4, %2
+ vpextrd [%4 + %3], x%6, 2
+%else
+ vmovd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovd [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastd x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vpextrd [%4 + %3], x%7, 2
+%endif
+%else
+ vextracti128 x%7, %5%6, 1
+ vpblendd x%6, x%6, x%7, 1010b
+ mov %4, %1
+%if %8 == 0
+ vmovlps [%4 + %3], x%6
+ mov %4, %2
+ vmovhps [%4 + %3], x%6
+%else
+ vmovq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovlps [%4 + %3], x%7
+ mov %4, %2
+ vpbroadcastq x%7, [%4 + %3]
+ vpaddd x%7, x%7, x%6
+ vmovhps [%4 + %3], x%7
+%endif
+%endif
+%endmacro
+
+
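+; Compute the 8x8 SADs for 8 rows of a 16 px (x) or 32 px (y) wide strip, store them
+; to p_sad8x8 and add them to the frame SAD accumulator.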
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSad_8Lines 7
+%define mm_tmp0 %2
+%define mm_sad %3
+%define mm_sad2 %4
+%define mm_sad3 %5
+%define mm_sad4 %6
+%define b_second_blocks %7
+%ifdef i_stride5
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ ; Use multiple accumulators to shorten dependency chains and enable more parallelism.
+ AVX2_Sad %1 %+ mm_sad, p_cur, p_ref, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_tmp0, 0
+ AVX2_Sad %1 %+ mm_sad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1
+%ifdef i_stride7
+ %define i_stride7_ i_stride7
+%else
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride7_ r_tmp
+%endif
+ AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_tmp0, 1
+ AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1
+%undef i_stride5_
+%undef i_stride7_
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+ ; Collapse accumulators.
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad2
+ vpaddd %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4
+ vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad3
+ AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+%undef mm_tmp0
+%undef mm_sad
+%undef mm_sad2
+%undef mm_sad3
+%undef mm_sad4
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSad_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSad_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r5, p_sad8x8
+ %define i_stride3 r3
+ %undef p_sad8x8
+ %define p_sad8x8 r5
+ %define r_tmp r6
+ lea i_stride3, [3 * i_stride]
+%else
+ mov rbp, p_sad8x8
+ %define i_stride3 rbx
+ %define i_stride5 r12
+ %define i_stride7 r6
+ %undef p_sad8x8
+ %define p_sad8x8 rbp
+ lea i_stride3, [3 * i_stride]
+ lea i_stride5, [5 * i_stride]
+ lea i_stride7, [i_stride + 2 * i_stride3]
+%endif
+
+ ; offset pointer so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+
+.height_loop:
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt]
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
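+    ; The counter is biased by 16 / xcnt_unit below so that reaching zero means exactly
+    ; one 16-px block remains (handled by the xmm path) and a positive value means the
+    ; row is done. e.g. a 48-px row starts at -8: one 32-px ymm iteration brings it to 0,
+    ; and the final 16 px fall through to the xmm path.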
+ neg i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ xor i_xcnt, i_xcnt
+ sub i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ neg i_xcnt
+ sub i_ycnt, 1
+ jnz .height_loop
+
+ pop i_xcnt
+%assign push_num push_num - 1
+%undef i_xcnt_load
+
+.done:
+ mov r6, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r6], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef r_tmp
+%undef xcnt_unit
+%undef i_stride3
+%undef i_stride5
+%undef i_stride7
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7
+%macro AVX2_CalcSadVar_8Lines 7
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sqsum %6
+%define b_second_blocks %7
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+ vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadVar_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadVar_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+%define mm_sadframe mm6
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+ vmovdqa y %+ mm_sadframe, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ mov r3, p_sad8x8
+ %undef p_sad8x8
+ %define p_sad8x8 r3
+ %define i_stride3 r5
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+ vextracti128 xmm2, y %+ mm_sadframe, 1
+ vpaddd xmm2, x %+ mm_sadframe, xmm2
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm_sadframe
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadSsd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %4
+%define mm_sad %5
+%define mm_sum %6
+%define mm_sqsum %7
+%define mm_sqdiff %8
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+%ifdef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ %define i_stride5_ i_stride5
+%else
+ lea r_tmp, [5 * i_stride]
+ %define i_stride5_ r_tmp
+%endif
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+%ifndef i_stride5
+ lea r_tmp, [i_stride + 2 * i_stride3]
+%endif
+%undef i_stride5_
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_sad
+%undef mm_sum
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsd_avx2
+%define p_sadframe ptrword arg6
+%define p_sad8x8 ptrword arg7
+%define p_sum16x16 ptrword arg8
+%define p_sqsum16x16 ptrword arg9
+%define p_sqdiff16x16 ptrword arg10
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 9
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov r12, p_sad8x8
+ mov r13, p_sum16x16
+ mov r14, p_sqsum16x16
+ mov r15, p_sqdiff16x16
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %define p_sad8x8 r12
+ %define p_sum16x16 r13
+ %define p_sqsum16x16 r14
+ %define p_sqdiff16x16 r15
+ %define i_stride3 rbx
+ %define i_stride5 rbp
+ lea i_stride5, [5 * i_stride]
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for i_xcnt offset below.
+ sub p_sad8x8, 4 * 16 / xcnt_unit
+ sub p_sum16x16, 1 * 16 / xcnt_unit
+ sub p_sqsum16x16, 1 * 16 / xcnt_unit
+ sub p_sqdiff16x16, 1 * 16 / xcnt_unit
+
+ ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_sad8x8, r_tmp
+ sub p_sum16x16, i_xcnt
+ sub p_sqsum16x16, i_xcnt
+ sub p_sqdiff16x16, i_xcnt
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_upper8_remaining16
+.width_loop_upper8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_end
+.width_loop_upper8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ add i_xcnt, 16 / xcnt_unit
+ jz .width_loop_lower8_remaining16
+.width_loop_lower8:
+ AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_end
+.width_loop_lower8_remaining16:
+ AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+%undef i_xcnt_load
+ pop i_xcnt
+ %assign push_num push_num - 1
+ lea p_cur, [p_cur + xcnt_unit * i_xcnt]
+ lea p_ref, [p_ref + xcnt_unit * i_xcnt]
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef i_stride5
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9
+%macro AVX2_CalcSadBgd_8Lines 9
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_tmp2 %8
+%define mm_mad %4
+%define mm_sumcur %5
+%define mm_sumref %6
+%define mm_sad %7
+%define b_second_blocks %9
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
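+    ; Every other 32-px iteration parks its MAD result in prev_mad; the next iteration
+    ; reduces and stores both results at once via AVX2_Maxubq2.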
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_tmp2
+%undef mm_mad
+%undef mm_sumcur
+%undef mm_sumref
+%undef mm_sad
+%undef b_second_blocks
+%endmacro
+
+; Store the MAD result still held in prev_mad when the row width leaves a trailing 32-px block (width & 32).
+; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4
+%macro AVX2_StoreRemainingSingleMad 4
+ test %1, 32 / xcnt_unit
+ jz %%skip
+ mov r_tmp, p_mad8x8
+ vmovdqa y%2, prev_mad
+ AVX2_Maxubq y%2, y%2, y%3
+ AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4
+%%skip:
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sd8x8 arg8
+%define p_mad8x8 arg9
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 10
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%define mm_zero mm0
+ vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero
+
+%ifdef X86_32
+ STACK_ALLOC r5, 2 * ymm_width, ymm_width
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+%else
+ %define sadframe_acc ymm8
+ %define xsadframe_acc xmm8
+ %define prev_mad ymm9
+%endif
+ vmovdqa sadframe_acc, y %+ mm_zero
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sd8x8
+ mov r13, p_mad8x8
+ %undef p_sad8x8
+ %undef p_sd8x8
+ %undef p_mad8x8
+ %define p_sad8x8 rbp
+ %define p_sd8x8 r12
+ %define p_mad8x8 r13
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef mm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
+
+
+; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11
+%macro AVX2_CalcSadSsdBgd_8Lines 11
+%define mm_tmp0 %2
+%define mm_tmp1 %3
+%define mm_sad %4
+%define mm_sum %5
+%define mm_sumref %6
+%define mm_mad %7
+%define mm_sqsum %8
+%define mm_sqdiff %9
+%ifidn %10, 0
+%define tmp2 0
+%else
+%define tmp2 %1 %+ %10
+%endif
+%define b_second_blocks %11
+ ; Unroll for better performance on Haswell.
+ ; Avoid unrolling for the 16 px case so as to reduce the code footprint.
+%ifidni %1, y
+ lea r_tmp, [5 * i_stride]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ lea r_tmp, [i_stride + 2 * i_stride3]
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ add p_cur, %1 %+ mm_width
+ add p_ref, %1 %+ mm_width
+%else
+ vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad
+ vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum
+ vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref
+ vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad
+ vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum
+ vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff
+ lea r_tmp, [8 * i_stride]
+ add p_cur, r_tmp
+ add p_ref, r_tmp
+ neg r_tmp
+%%loop:
+ AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1
+ add r_tmp, i_stride
+ jl %%loop
+ ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell.
+ lea r_tmp, [8 * i_stride - %1 %+ mm_width]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+%endif
+ mov r_tmp, p_sad8x8
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks
+%ifdef X86_32
+ vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc
+ vmovdqa sadframe_acc, y %+ mm_tmp1
+%else
+ vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad
+%endif
+ mov r_tmp, i_xcnt
+ add r_tmp, p_sum16x16
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1
+ AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+ mov r_tmp, p_sd8x8
+ vpsubd %1 %+ mm_sum, %1 %+ mm_sum, %1 %+ mm_sumref
+ AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks
+ ; Coalesce store and horizontal reduction of MAD accumulator for even and
+ ; odd iterations so as to enable more parallelism.
+%ifidni %1, y
+ test i_xcnt, 32 / xcnt_unit
+ jz %%preserve_mad
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0
+ AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks
+%%preserve_mad:
+ vmovdqa prev_mad, y %+ mm_mad
+%else
+ mov r_tmp, p_mad8x8
+ AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0
+ AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks
+%endif
+ vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b
+ vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1
+ AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks
+%undef mm_tmp0
+%undef mm_tmp1
+%undef mm_sqsum
+%undef mm_sqdiff
+%undef mm_mad
+%undef mm_sum
+%undef mm_sumref
+%undef mm_sad
+%undef tmp2
+%undef b_second_blocks
+%endmacro
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_avx2
+%define p_sadframe arg6
+%define p_sad8x8 arg7
+%define p_sum16x16 arg8
+%define p_sqsum16x16 arg9
+%define p_sqdiff16x16 arg10
+%define p_sd8x8 arg11
+%define p_mad8x8 arg12
+%ifdef X86_32
+%define saveregs r5, r6
+%else
+%define saveregs rbx, rbp, r12, r13, r14, r15
+%endif
+
+%assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 12
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ PUSHM saveregs
+
+%ifdef X86_32
+ STACK_ALLOC r5, 3 * ymm_width, ymm_width
+ %define mm8 0
+ %define sadframe_acc_addr r5
+ %define sadframe_acc [sadframe_acc_addr]
+ %define prev_mad [r5 + ymm_width]
+ %define ymm_zero [r5 + 2 * ymm_width]
+ %define xmm_zero ymm_zero
+ vpxor xmm0, xmm0, xmm0
+ vmovdqa sadframe_acc, ymm0
+ vmovdqa ymm_zero, ymm0
+%else
+ %define sadframe_acc ymm9
+ %define xsadframe_acc xmm9
+ %define prev_mad ymm10
+ %define ymm_zero ymm11
+ %define xmm_zero xmm11
+ vpxor xmm_zero, xmm_zero, xmm_zero
+ vpxor xsadframe_acc, xsadframe_acc, xsadframe_acc
+%endif
+
+ and r2, -16 ; iPicWidth &= -16
+ jle .done ; bail if iPicWidth < 16
+ sar r3, 4 ; iPicHeight / 16
+ jle .done ; bail if iPicHeight < 16
+ shr r2, 2 ; iPicWidth / 4
+
+%define p_cur r0
+%define p_ref r1
+%define i_xcnt r2
+%define i_ycnt ptrword arg4
+%define i_stride r4
+%define r_tmp r6
+%define xcnt_unit 4
+%ifdef X86_32
+ mov i_ycnt, r3
+ %define i_stride3 r3
+%else
+ mov rbp, p_sad8x8
+ mov r12, p_sum16x16
+ mov r13, p_sqsum16x16
+ mov r14, p_sqdiff16x16
+ mov r15, p_sd8x8
+ %undef p_sad8x8
+ %undef p_sum16x16
+ %undef p_sqsum16x16
+ %undef p_sqdiff16x16
+ %undef p_sd8x8
+ %define p_sad8x8 rbp
+ %define p_sum16x16 r12
+ %define p_sqsum16x16 r13
+ %define p_sqdiff16x16 r14
+ %define p_sd8x8 r15
+ %define i_stride3 rbx
+%endif
+ lea i_stride3, [3 * i_stride]
+
+ ; offset pointers so as to compensate for the i_xcnt offset below.
+ mov r_tmp, i_xcnt
+ and r_tmp, 64 / xcnt_unit - 1
+ sub p_sum16x16, r_tmp
+ sub p_sqsum16x16, r_tmp
+ sub p_sqdiff16x16, r_tmp
+ sub p_mad8x8, r_tmp
+ shl r_tmp, 2
+ sub p_sad8x8, r_tmp
+ sub p_sd8x8, r_tmp
+
+.height_loop:
+ push i_xcnt
+%assign push_num push_num + 1
+%define i_xcnt_load ptrword [r7]
+ ; use end-of-line pointers so as to enable use of a negative counter as index.
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ add p_sad8x8, r_tmp
+ add p_sum16x16, i_xcnt
+ add p_sqsum16x16, i_xcnt
+ add p_sqdiff16x16, i_xcnt
+ add p_sd8x8, r_tmp
+ add p_mad8x8, i_xcnt
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_upper8_64x_end
+ ; use a negative loop counter to enable counting toward zero and indexing with the same counter.
+ neg i_xcnt
+.width_loop_upper8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_upper8
+ jg .width_loop_upper8_32x_end
+.width_loop_upper8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_upper8
+.width_loop_upper8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_upper8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0
+.width_loop_upper8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ mov i_xcnt, i_xcnt_load
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ and i_xcnt, -(64 / xcnt_unit)
+ jz .width_loop_lower8_64x_end
+ neg i_xcnt
+.width_loop_lower8:
+ AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+ add i_xcnt, 32 / xcnt_unit
+ jl .width_loop_lower8
+ jg .width_loop_lower8_32x_end
+.width_loop_lower8_64x_end:
+ test i_xcnt_load, 32 / xcnt_unit
+ jnz .width_loop_lower8
+.width_loop_lower8_32x_end:
+ AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1
+ test i_xcnt_load, 16 / xcnt_unit
+ jz .width_loop_lower8_end
+ ; remaining 16.
+ AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1
+.width_loop_lower8_end:
+ lea p_cur, [p_cur + 8 * i_stride]
+ lea p_ref, [p_ref + 8 * i_stride]
+ pop i_xcnt
+%undef i_xcnt_load
+ %assign push_num push_num - 1
+ lea r_tmp, [xcnt_unit * i_xcnt]
+ sub p_cur, r_tmp
+ sub p_ref, r_tmp
+ sub i_ycnt, 1
+ jnz .height_loop
+
+.done:
+ mov r_tmp, p_sadframe
+%ifdef X86_32
+ vmovdqa xmm2, sadframe_acc
+ vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width]
+%else
+ vextracti128 xmm2, sadframe_acc, 1
+ vpaddd xmm2, xsadframe_acc, xmm2
+%endif
+ vpunpckhqdq xmm1, xmm2, xmm2
+ vpaddd xmm2, xmm2, xmm1
+ vmovd [r_tmp], xmm2
+ vzeroupper
+%ifdef X86_32
+ STACK_DEALLOC
+%endif
+ POPM saveregs
+ POP_XMM
+ LOAD_5_PARA_POP
+%undef p_cur
+%undef p_ref
+%undef i_xcnt
+%undef i_ycnt
+%undef i_stride
+%undef i_stride3
+%undef r_tmp
+%undef xcnt_unit
+%undef mm8
+%undef sadframe_acc
+%undef sadframe_acc_addr
+%undef xsadframe_acc
+%undef prev_mad
+%undef ymm_zero
+%undef xmm_zero
+%undef saveregs
+%undef p_sadframe
+%undef p_sad8x8
+%undef p_sum16x16
+%undef p_sqsum16x16
+%undef p_sqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+ ret
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -590,20 +590,22 @@
int32_t pic_height_a; \
int32_t pic_stride_a; \
int32_t psadframe_a; \
- pic_width_c = pic_width_a = 320-16; \
- pic_height_c = pic_height_a = 320; \
- pic_stride_c = pic_stride_a = 320; \
- psadframe_c = psadframe_a = 0; \
- for (int j=0; j<BUFFER_SIZE; j++) { \
- cur_data_c[j] = cur_data_a[j] = (rand()%256); \
- ref_data_c[j] = ref_data_a[j] = (rand()%256); \
- psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ for (int i=0; i<4; i++) { \
+ pic_width_c = pic_width_a = 320-16*i; \
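+ /* Widths 320, 304, 288 and 272 exercise the different row-width remainders of the SIMD tail paths. */ \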
+ pic_height_c = pic_height_a = 320; \
+ pic_stride_c = pic_stride_a = 320; \
+ psadframe_c = psadframe_a = 0; \
+ for (int j=0; j<BUFFER_SIZE; j++) { \
+ cur_data_c[j] = cur_data_a[j] = (rand()%256); \
+ ref_data_c[j] = ref_data_a[j] = (rand()%256); \
+ psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ } \
+ VAACalcSad_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c); \
+ func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a); \
+ ASSERT_EQ (psadframe_a, psadframe_c); \
+ for (int j=0; j<(BUFFER_SIZE/64); j++) \
+ ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
} \
- VAACalcSad_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c); \
- func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a); \
- ASSERT_EQ (psadframe_a, psadframe_c); \
- for (int j=0; j<(BUFFER_SIZE/64); j++) \
- ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
}
@@ -633,25 +635,27 @@
int32_t pic_height_a; \
int32_t pic_stride_a; \
int32_t psadframe_a; \
- pic_width_c = pic_width_a = 320-16; \
- pic_height_c = pic_height_a = 320; \
- pic_stride_c = pic_stride_a = 320; \
- psadframe_c = psadframe_a = 0; \
- for (int j=0; j<BUFFER_SIZE; j++) { \
- cur_data_c[j] = cur_data_a[j] = (rand()%256); \
- ref_data_c[j] = ref_data_a[j] = (rand()%256); \
- psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- psd8x8_c[j%(BUFFER_SIZE/64)] = psd8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ for (int i=0; i<4; i++) { \
+ pic_width_c = pic_width_a = 320-16*i; \
+ pic_height_c = pic_height_a = 320; \
+ pic_stride_c = pic_stride_a = 320; \
+ psadframe_c = psadframe_a = 0; \
+ for (int j=0; j<BUFFER_SIZE; j++) { \
+ cur_data_c[j] = cur_data_a[j] = (rand()%256); \
+ ref_data_c[j] = ref_data_a[j] = (rand()%256); \
+ psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ psd8x8_c[j%(BUFFER_SIZE/64)] = psd8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ } \
+ VAACalcSadBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psd8x8_c, pmad8x8_c); \
+ func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psd8x8_a, pmad8x8_a); \
+ ASSERT_EQ (psadframe_a, psadframe_c); \
+ for (int j=0; j<(BUFFER_SIZE/64); j++) {\
+ ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
+ ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
+ ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
+ } \
} \
- VAACalcSadBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psd8x8_c, pmad8x8_c); \
- func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psd8x8_a, pmad8x8_a); \
- ASSERT_EQ (psadframe_a, psadframe_c); \
- for (int j=0; j<(BUFFER_SIZE/64); j++) {\
- ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
- ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
- ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
- } \
}
#define GENERATE_VAACalcSadSsd_UT(func, ASM, CPUFLAGS) \
@@ -682,29 +686,31 @@
int32_t pic_height_a; \
int32_t pic_stride_a; \
int32_t psadframe_a; \
- pic_width_c = pic_width_a = 320-16; \
- pic_height_c = pic_height_a = 320; \
- pic_stride_c = pic_stride_a = 320; \
- psadframe_c = psadframe_a = 0; \
- for (int j=0; j<BUFFER_SIZE; j++) { \
- cur_data_c[j] = cur_data_a[j] = (rand()%256); \
- ref_data_c[j] = ref_data_a[j] = (rand()%256); \
- psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
- psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
- psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ for (int i=0; i<4; i++) { \
+ pic_width_c = pic_width_a = 320-16*i; \
+ pic_height_c = pic_height_a = 320; \
+ pic_stride_c = pic_stride_a = 320; \
+ psadframe_c = psadframe_a = 0; \
+ for (int j=0; j<BUFFER_SIZE; j++) { \
+ cur_data_c[j] = cur_data_a[j] = (rand()%256); \
+ ref_data_c[j] = ref_data_a[j] = (rand()%256); \
+ psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ } \
+ VAACalcSadSsd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c); \
+ func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a); \
+ ASSERT_EQ (psadframe_a, psadframe_c); \
+ for (int j=0; j<(BUFFER_SIZE/64); j++) {\
+ ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
+ } \
+ for (int j=0; j<(BUFFER_SIZE/256); j++) {\
+ ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
+ ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
+ ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
+ } \
} \
- VAACalcSadSsd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c); \
- func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a); \
- ASSERT_EQ (psadframe_a, psadframe_c); \
- for (int j=0; j<(BUFFER_SIZE/64); j++) {\
- ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
- } \
- for (int j=0; j<(BUFFER_SIZE/256); j++) {\
- ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
- ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
- ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
- } \
}
#define GENERATE_VAACalcSadVar_UT(func, ASM, CPUFLAGS) \
@@ -733,27 +739,29 @@
int32_t pic_height_a; \
int32_t pic_stride_a; \
int32_t psadframe_a; \
- pic_width_c = pic_width_a = 320-16; \
- pic_height_c = pic_height_a = 320; \
- pic_stride_c = pic_stride_a = 320; \
- psadframe_c = psadframe_a = 0; \
- for (int j=0; j<BUFFER_SIZE; j++) { \
- cur_data_c[j] = cur_data_a[j] = (rand()%256); \
- ref_data_c[j] = ref_data_a[j] = (rand()%256); \
- psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
- psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ for (int i=0; i<4; i++) { \
+ pic_width_c = pic_width_a = 320-16*i; \
+ pic_height_c = pic_height_a = 320; \
+ pic_stride_c = pic_stride_a = 320; \
+ psadframe_c = psadframe_a = 0; \
+ for (int j=0; j<BUFFER_SIZE; j++) { \
+ cur_data_c[j] = cur_data_a[j] = (rand()%256); \
+ ref_data_c[j] = ref_data_a[j] = (rand()%256); \
+ psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ } \
+ VAACalcSadVar_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c); \
+ func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a); \
+ ASSERT_EQ (psadframe_a, psadframe_c); \
+ for (int j=0; j<(BUFFER_SIZE/64); j++) {\
+ ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
+ } \
+ for (int j=0; j<(BUFFER_SIZE/256); j++) {\
+ ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
+ ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
+ } \
} \
- VAACalcSadVar_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c); \
- func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a); \
- ASSERT_EQ (psadframe_a, psadframe_c); \
- for (int j=0; j<(BUFFER_SIZE/64); j++) {\
- ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
- } \
- for (int j=0; j<(BUFFER_SIZE/256); j++) {\
- ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
- ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
- } \
}
#define GENERATE_VAACalcSadSsdBgd_UT(func, ASM, CPUFLAGS) \
@@ -788,33 +796,35 @@
int32_t pic_height_a; \
int32_t pic_stride_a; \
int32_t psadframe_a; \
- pic_width_c = pic_width_a = 320-16; \
- pic_height_c = pic_height_a = 320; \
- pic_stride_c = pic_stride_a = 320; \
- psadframe_c = psadframe_a = 0; \
- for (int j=0; j<BUFFER_SIZE; j++) { \
- cur_data_c[j] = cur_data_a[j] = (rand()%256); \
- ref_data_c[j] = ref_data_a[j] = (rand()%256); \
- psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- psd8x8_c[j%(BUFFER_SIZE/64)] = psd8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
- psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
- psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
- psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ for (int i=0; i<4; i++) { \
+ pic_width_c = pic_width_a = 320-16*i; \
+ pic_height_c = pic_height_a = 320; \
+ pic_stride_c = pic_stride_a = 320; \
+ psadframe_c = psadframe_a = 0; \
+ for (int j=0; j<BUFFER_SIZE; j++) { \
+ cur_data_c[j] = cur_data_a[j] = (rand()%256); \
+ ref_data_c[j] = ref_data_a[j] = (rand()%256); \
+ psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ psd8x8_c[j%(BUFFER_SIZE/64)] = psd8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
+ psum16x16_c[j%(BUFFER_SIZE/256)] = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ psqsum16x16_c[j%(BUFFER_SIZE/256)] = psqsum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
+ } \
+ VAACalcSadSsdBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c, psd8x8_c, pmad8x8_c); \
+ func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a, psd8x8_a, pmad8x8_a); \
+ ASSERT_EQ (psadframe_a, psadframe_c); \
+ for (int j=0; j<(BUFFER_SIZE/64); j++) {\
+ ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
+ ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
+ ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
+ } \
+ for (int j=0; j<(BUFFER_SIZE/256); j++) {\
+ ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
+ ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
+ ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
+ } \
} \
- VAACalcSadSsdBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c, psd8x8_c, pmad8x8_c); \
- func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a, psd8x8_a, pmad8x8_a); \
- ASSERT_EQ (psadframe_a, psadframe_c); \
- for (int j=0; j<(BUFFER_SIZE/64); j++) {\
- ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
- ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
- ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
- } \
- for (int j=0; j<(BUFFER_SIZE/256); j++) {\
- ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
- ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
- ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
- } \
}
GENERATE_VAACalcSad_UT (VAACalcSad_c, 0, 0)
@@ -828,6 +838,12 @@
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)
+
+GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
#endif
#if defined(HAVE_NEON)