ref: 0b721db543538672189cdf34fb73b7dbaa4f9415
parent: 72037944df611abccac02c225e20adc9479200db
author: Dmitry Kovalev <[email protected]>
date: Fri Aug 29 10:25:31 EDT 2014
Replacing asm 8x8 variance calculation with intrinsics. New code is 10% faster for 64-bit and 25% faster for 32-bit. Compiled using clang. Change-Id: I8ba1544c30dd6f3ca479db806384317549650dfc
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -209,193 +209,3 @@
UNSHADOW_ARGS
pop rbp
ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -51,10 +51,47 @@
return 0;
}
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
+
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
@@ -110,8 +147,7 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_get8x8var_sse2, 8);
+ vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}