shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -125,6 +125,7 @@

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";

+  specialize qw/vp9_block_error avx2/;

   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";

   specialize qw/vp9_highbd_block_error sse2/;

--- a/vp9/encoder/x86/vp9_error_intrin_avx2.c

+++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c

@@ -12,8 +12,10 @@

 #include "./vp9_rtcd.h"

 #include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"

-int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,

+int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,

                              intptr_t block_size, int64_t *ssz) {

   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;

   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;

@@ -29,8 +31,8 @@

   for (i = 0; i < block_size; i += 16) {

     // load 32 bytes from coeff and dqcoeff

-    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));

-    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));

+    coeff_reg = load_tran_low(coeff + i);

+    dqcoeff_reg = load_tran_low(dqcoeff + i);

     // dqcoeff - coeff

     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);

     // madd (dqcoeff - coeff)

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -13,6 +13,7 @@

 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h

+DSP_SRCS-$(HAVE_AVX2)   += x86/bitdepth_conversion_avx2.h

 DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2.h

 # This file is included in libs.mk. Including it here would cause it to be

 # compiled into an object. Even as an empty file, this would create an

--- /dev/null

+++ b/vpx_dsp/x86/bitdepth_conversion_avx2.h

@@ -1,0 +1,30 @@

+/*

+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_

+#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_

+#include <immintrin.h>

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+// Load 16 coefficients and return them as the 16-bit lanes of one __m256i.
+//
+// With CONFIG_VP9_HIGHBITDEPTH the source is read as sixteen 32-bit values
+// (two 256-bit loads) that are packed down to 16 bits with signed saturation.
+// _mm256_packs_epi32 packs within each 128-bit lane, so the result holds the
+// elements in the order 0-3, 8-11, 4-7, 12-15 rather than 0-15. Callers must
+// either be order-independent or apply the same load to every operand.
+// Without CONFIG_VP9_HIGHBITDEPTH the 32 bytes at |a| are loaded unchanged.
+//
+// No alignment of |a| is required: every access is an unaligned load.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  // Use loadu here as well; dereferencing a cast __m256i pointer would be an
+  // aligned access and is not safe for arbitrary |a|.
+  const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+  return _mm256_packs_epi32(a_low, a_high);
+#else
+  return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}

+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_