shithub: libvpx

--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c

+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c

@@ -22,6 +22,7 @@

 #include "vp9/encoder/vp9_rd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/vpx_dsp_common.h"

 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,

--- a/vpx_dsp/arm/avg_neon.c

+++ b/vpx_dsp/arm/avg_neon.c

@@ -16,6 +16,7 @@

 #include "vpx/vpx_integer.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {

   const uint32x4_t a = vpaddlq_u16(v_16x8);

--- a/vpx_dsp/arm/fwd_txfm_neon.c

+++ b/vpx_dsp/arm/fwd_txfm_neon.c

@@ -14,6 +14,7 @@

 #include "vpx_dsp/txfm_common.h"

 #include "vpx_dsp/vpx_dsp_common.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,

                       int stride) {

--- a/vpx_dsp/arm/hadamard_neon.c

+++ b/vpx_dsp/arm/hadamard_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx/vpx_integer.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,

--- a/vpx_dsp/arm/idct16x16_add_neon.c

+++ b/vpx_dsp/arm/idct16x16_add_neon.c

@@ -12,6 +12,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/txfm_common.h"

 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,

--- a/vpx_dsp/arm/idct32x32_135_add_neon.c

+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

--- a/vpx_dsp/arm/idct32x32_34_add_neon.c

+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

--- a/vpx_dsp/arm/idct32x32_add_neon.c

+++ b/vpx_dsp/arm/idct32x32_add_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

--- a/vpx_dsp/arm/idct4x4_1_add_neon.c

+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c

@@ -12,6 +12,7 @@

 #include <assert.h>

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/inv_txfm.h"

 static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,

--- a/vpx_dsp/arm/idct4x4_add_neon.c

+++ b/vpx_dsp/arm/idct4x4_add_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/txfm_common.h"

 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,

--- a/vpx_dsp/arm/idct8x8_add_neon.c

+++ b/vpx_dsp/arm/idct8x8_add_neon.c

@@ -13,6 +13,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

--- a/vpx_dsp/arm/idct_neon.h

+++ b/vpx_dsp/arm/idct_neon.h

@@ -41,58 +41,6 @@

};

 //------------------------------------------------------------------------------

-// Helper functions used to load tran_low_t into int16, narrowing if necessary.

-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  const int32x4x2_t v0 = vld2q_s32(buf);

-  const int32x4x2_t v1 = vld2q_s32(buf + 8);

-  const int16x4_t s0 = vmovn_s32(v0.val[0]);

-  const int16x4_t s1 = vmovn_s32(v0.val[1]);

-  const int16x4_t s2 = vmovn_s32(v1.val[0]);

-  const int16x4_t s3 = vmovn_s32(v1.val[1]);

-  int16x8x2_t res;

-  res.val[0] = vcombine_s16(s0, s2);

-  res.val[1] = vcombine_s16(s1, s3);

-  return res;

-#else

-  return vld2q_s16(buf);

-#endif

-}

-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  const int32x4_t v0 = vld1q_s32(buf);

-  const int32x4_t v1 = vld1q_s32(buf + 4);

-  const int16x4_t s0 = vmovn_s32(v0);

-  const int16x4_t s1 = vmovn_s32(v1);

-  return vcombine_s16(s0, s1);

-#else

-  return vld1q_s16(buf);

-#endif

-}

-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  const int32x4_t v0 = vld1q_s32(buf);

-  return vmovn_s32(v0);

-#else

-  return vld1_s16(buf);

-#endif

-}

-static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));

-  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));

-  vst1q_s32(buf, v0);

-  vst1q_s32(buf + 4, v1);

-#else

-  vst1q_s16(buf, a);

-#endif

-}

-//------------------------------------------------------------------------------

 // Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth

 static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {

 #if CONFIG_VP9_HIGHBITDEPTH

--- /dev/null

+++ b/vpx_dsp/arm/mem_neon.h

@@ -1,0 +1,71 @@

+/*

+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_ARM_MEM_NEON_H_

+#define VPX_DSP_ARM_MEM_NEON_H_

+#include <arm_neon.h>

+#include <assert.h>

+#include <string.h>

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+// Helpers to load/store tran_low_t as int16, narrowing/widening if necessary.

+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {

+#if CONFIG_VP9_HIGHBITDEPTH  // buf is read as int32 here, so narrow to int16

+  const int32x4x2_t v0 = vld2q_s32(buf);  // buf[0..7], split even/odd

+  const int32x4x2_t v1 = vld2q_s32(buf + 8);  // buf[8..15], split even/odd

+  const int16x4_t s0 = vmovn_s32(v0.val[0]);  // vmovn keeps the low 16 bits

+  const int16x4_t s1 = vmovn_s32(v0.val[1]);

+  const int16x4_t s2 = vmovn_s32(v1.val[0]);

+  const int16x4_t s3 = vmovn_s32(v1.val[1]);

+  int16x8x2_t res;

+  res.val[0] = vcombine_s16(s0, s2);  // even-indexed elements of buf[0..15]

+  res.val[1] = vcombine_s16(s1, s3);  // odd-indexed elements of buf[0..15]

+  return res;

+#else  // buf is read as int16: a single de-interleaving load of 16 elements

+  return vld2q_s16(buf);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+}

+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {

+#if CONFIG_VP9_HIGHBITDEPTH  // buf is read as int32 here, so narrow to int16

+  const int32x4_t v0 = vld1q_s32(buf);  // buf[0..3]

+  const int32x4_t v1 = vld1q_s32(buf + 4);  // buf[4..7]

+  const int16x4_t s0 = vmovn_s32(v0);  // vmovn keeps the low 16 bits

+  const int16x4_t s1 = vmovn_s32(v1);

+  return vcombine_s16(s0, s1);  // buf[0..7] in order, as int16

+#else  // buf is read as int16: load 8 consecutive elements directly

+  return vld1q_s16(buf);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+}

+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {

+#if CONFIG_VP9_HIGHBITDEPTH  // buf is read as int32 here, so narrow to int16

+  const int32x4_t v0 = vld1q_s32(buf);  // buf[0..3]

+  return vmovn_s32(v0);  // vmovn keeps the low 16 bits of each element

+#else  // buf is read as int16: load 4 consecutive elements directly

+  return vld1_s16(buf);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+}

+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {

+#if CONFIG_VP9_HIGHBITDEPTH  // buf is written as int32 here, so widen first

+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));  // lanes 0-3, sign-extended

+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));  // lanes 4-7, sign-extended

+  vst1q_s32(buf, v0);  // writes buf[0..3]

+  vst1q_s32(buf + 4, v1);  // writes buf[4..7]

+#else  // buf is written as int16: store all 8 lanes directly

+  vst1q_s16(buf, a);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+}

+#endif  // VPX_DSP_ARM_MEM_NEON_H_

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -352,6 +352,7 @@

 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

 # Neon utilities

+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h

 DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h

 # PPC VSX utilities