ref: efa82922e40531bda3e3f5e02ad4cfb51b4c514a
parent: b418c9582e2278ab251a78b7bb4b725e59dab6c7
parent: af10457e02f608b79db1e2883b7780721cbf715d
author: James Zern <[email protected]>
date: Thu Aug 7 12:11:19 EDT 2014
Merge "Fix bug 806"
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -640,19 +640,9 @@
#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_ptr[], int ref_stride,
- unsigned int *sad_array);
-void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_ptr[], int ref_stride,
- unsigned int *sad_array);
-}
const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
make_tuple(32, 32, sad_32x32x4d_avx2),
make_tuple(64, 64, sad_64x64x4d_avx2)));
#endif // CONFIG_VP9_ENCODER
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -653,7 +653,7 @@
specialize qw/vp9_sad4x4x8 sse4/;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
@@ -668,7 +668,7 @@
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x16x4d sse2/;
--- a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
@@ -31,7 +31,7 @@
sum_ref3 = _mm256_set1_epi16(0);
for (i = 0; i < 32 ; i++) {
// load src and all refs
- src_reg = _mm256_load_si256((__m256i *)(src));
+ src_reg = _mm256_loadu_si256((__m256i *)(src));
ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
@@ -103,8 +103,8 @@
sum_ref3 = _mm256_set1_epi16(0);
for (i = 0; i < 64 ; i++) {
// load 64 bytes from src and all refs
- src_reg = _mm256_load_si256((__m256i *)(src));
- srcnext_reg = _mm256_load_si256((__m256i *)(src + 32));
+ src_reg = _mm256_loadu_si256((__m256i *)(src));
+ srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32));
ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32));
ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));