ref: d217c87139a3218d9dc4154782de53b9d0cc1119
parent: e7cac130167c1da6d17caa33e216250d989d0fe8
author: Johann <[email protected]>
date: Mon May 15 12:30:00 EDT 2017
neon variance: special case 4x. The sub-pixel variance uses a temp buffer which guarantees width == stride. Take advantage of this for the 4x case and avoid the very costly lane loads. Change-Id: Ia0c97eb8c29dc8dfa6e51a29dff9b75b3c6726f1
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -83,6 +83,7 @@
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
uint32_t a;
uint32x4_t a_u32 = vdupq_n_u32(0);
+ if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vld1q_lane_u32(&a, a_u32, 0);
@@ -102,6 +103,10 @@
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
const uint8x16_t a) {
const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+ if (stride == 4) {
+ vst1q_u8(buf, a);
+ return;
+ }
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));