shithub: libvpx

--- a/vpx_dsp/arm/mem_neon.h

+++ b/vpx_dsp/arm/mem_neon.h

@@ -101,9 +101,9 @@

   if (stride == 4) return vld1_u8(buf);

   memcpy(&a, buf, 4);

   buf += stride;

-  a_u32 = vld1_lane_u32(&a, a_u32, 0);

+  a_u32 = vset_lane_u32(a, a_u32, 0);

   memcpy(&a, buf, 4);

-  a_u32 = vld1_lane_u32(&a, a_u32, 1);

+  a_u32 = vset_lane_u32(a, a_u32, 1);

   return vreinterpret_u8_u32(a_u32);

@@ -127,16 +127,16 @@

   if (stride == 4) return vld1q_u8(buf);

   memcpy(&a, buf, 4);

   buf += stride;

-  a_u32 = vld1q_lane_u32(&a, a_u32, 0);

+  a_u32 = vsetq_lane_u32(a, a_u32, 0);

   memcpy(&a, buf, 4);

   buf += stride;

-  a_u32 = vld1q_lane_u32(&a, a_u32, 1);

+  a_u32 = vsetq_lane_u32(a, a_u32, 1);

   memcpy(&a, buf, 4);

   buf += stride;

-  a_u32 = vld1q_lane_u32(&a, a_u32, 2);

+  a_u32 = vsetq_lane_u32(a, a_u32, 2);

   memcpy(&a, buf, 4);

   buf += stride;

-  a_u32 = vld1q_lane_u32(&a, a_u32, 3);

+  a_u32 = vsetq_lane_u32(a, a_u32, 3);

   return vreinterpretq_u8_u32(a_u32);

--- a/vpx_dsp/arm/sad4d_neon.c

+++ b/vpx_dsp/arm/sad4d_neon.c

@@ -10,6 +10,7 @@

 #include <arm_neon.h>

+#include <assert.h>

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

 #include "vpx/vpx_integer.h"

@@ -16,58 +17,145 @@

 #include "vpx_dsp/arm/mem_neon.h"

 #include "vpx_dsp/arm/sum_neon.h"

+static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,

+                                                 const void *const buf1) {  // Packs 4 bytes read from buf0 and 4 bytes read from buf1 into one 8-byte vector.

+  uint32_t a;  // 4-byte staging word, filled via memcpy so neither buffer is dereferenced as uint32_t

+  uint32x2_t aa = vdup_n_u32(0);  // both 32-bit lanes start at zero

+  memcpy(&a, buf0, 4);  // first 4 bytes at buf0 ...

+  aa = vset_lane_u32(a, aa, 0);  // ... become 32-bit lane 0

+  memcpy(&a, buf1, 4);  // first 4 bytes at buf1 ...

+  aa = vset_lane_u32(a, aa, 1);  // ... become 32-bit lane 1

+  return vreinterpret_u8_u32(aa);  // same 64 bits, returned as eight uint8 lanes

+}

+static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,

+                            const uint8_t *const ref[4], const int ref_stride,

+                            const int height, uint32_t *const res) {  // Sums |src - ref[k]| over `height` rows of 4 bytes for k = 0..3 and stores the four totals to res[0..3].

+  int i;

+  uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };  // abs[0] accumulates for ref[0]/ref[1], abs[1] for ref[2]/ref[3]

+  uint16x4_t a[2];

+  uint32x4_t r;

+  assert(!((intptr_t)src % sizeof(uint32_t)));  // src rows are read through a uint32_t pointer below, so the base must be 4-byte aligned

+  assert(!(src_stride % sizeof(uint32_t)));  // ... and the stride must keep every row 4-byte aligned

+  for (i = 0; i < height; ++i) {

+    const uint8x8_t s = vreinterpret_u8_u32(  // the 4 source bytes of row i, duplicated into both halves of the vector

+        vld1_dup_u32((const uint32_t *)(src + i * src_stride)));

+    const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,  // row i of ref[0] and of ref[1], side by side

+                                                     ref[1] + i * ref_stride);

+    const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,  // row i of ref[2] and of ref[3], side by side

+                                                     ref[3] + i * ref_stride);

+    abs[0] = vabal_u8(abs[0], s, ref01);  // widening accumulate of |s - ref01| into 16-bit lanes

+    abs[1] = vabal_u8(abs[1], s, ref23);  // same for ref[2]/ref[3]

+  }

+  a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));  // pairwise add: two partial sums for ref[0], then two for ref[1]

+  a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));  // two partial sums for ref[2], then two for ref[3]

+  r = vpaddlq_u16(vcombine_u16(a[0], a[1]));  // widening pairwise add: one 32-bit total per reference

+  vst1q_u32(res, r);  // writes res[0..3] in a single store

+}

 void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,

                         const uint8_t *const ref[4], int ref_stride,

                         uint32_t *res) {

-  int i;

-  const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride);

-  for (i = 0; i < 4; ++i) {

-    const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride);

-    uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));

-    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));

-    res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);

-  }

+  sad4x_4d(src, src_stride, ref, ref_stride, 4, res);

 void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,

                         const uint8_t *const ref[4], int ref_stride,

                         uint32_t *res) {

-  int i;

-  const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride);

-  const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride);

-  for (i = 0; i < 4; ++i) {

-    const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride);

-    const uint8x16_t ref_1 =

-        load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride);

-    uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0));

-    abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0));

-    abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1));

-    abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1));

-    res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);

-  }

+  sad4x_4d(src, src_stride, ref, ref_stride, 8, res);

-static INLINE void sad8x_4d(const uint8_t *a, int a_stride,

-                            const uint8_t *const b[4], int b_stride,

-                            uint32_t *result, const int height) {

+////////////////////////////////////////////////////////////////////////////////

+// Reduces four 16-bit accumulators sum[0..3] to four 32-bit totals in res[0..3]. Can handle 512 pixels' sad sum (such as 16x32 or 32x16)

+static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,

+                                          uint32_t *const res) {

+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));  // fold 8 lanes to 4, still in 16 bits

+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));

+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));

+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));

+  const uint16x4_t b0 = vpadd_u16(a0, a1);  // second 16-bit add stage: two pair sums of a0, then two of a1. NOTE(review): two non-widening stages mean input lanes must stay below 2^14 to avoid wrap - confirm against callers

+  const uint16x4_t b1 = vpadd_u16(a2, a3);  // two pair sums of a2, then two of a3

+  const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));  // widening pairwise add: lane k is the full total of sum[k]

+  vst1q_u32(res, r);  // writes res[0..3] in a single store

+}

+// Reduces four 16-bit accumulators sum[0..3] to four 32-bit totals in res[0..3]. Can handle 1024 pixels' sad sum (such as 32x32)

+static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,

+                                           uint32_t *const res) {

+  const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));  // the only 16-bit add stage: adjacent lanes summed, 8 lanes -> 4

+  const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));

+  const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));

+  const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));

+  const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));  // widen to 32 bits: two partial sums of sum[0], then two of sum[1]

+  const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));  // two partial sums of sum[2], then two of sum[3]

+  const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));  // { total of sum[0], total of sum[1] }

+  const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));  // { total of sum[2], total of sum[3] }

+  vst1q_u32(res, vcombine_u32(c0, c1));  // writes res[0..3] in a single store

+}

+// Reduces four 16-bit accumulators sum[0..3] to four 32-bit totals in res[0..3]. Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)

+static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,

+                                           uint32_t *const res) {

+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);  // widen immediately: no addition is performed in 16 bits here

+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);

+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);

+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);

+  const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));  // fold 4 lanes to 2

+  const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));

+  const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));

+  const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));

+  const uint32x2_t c0 = vpadd_u32(b0, b1);  // { total of sum[0], total of sum[1] }

+  const uint32x2_t c1 = vpadd_u32(b2, b3);  // { total of sum[2], total of sum[3] }

+  vst1q_u32(res, vcombine_u32(c0, c1));  // writes res[0..3] in a single store

+}

+// Reduces eight 16-bit accumulators to four 32-bit totals: res[k] = total of sum[2k] and sum[2k+1]. Can handle 4096 pixels' sad sum (such as 64x64)

+static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,

+                                           uint32_t *const res) {

+  const uint32x4_t a0 = vpaddlq_u16(sum[0]);  // widen immediately: no addition is performed in 16 bits here

+  const uint32x4_t a1 = vpaddlq_u16(sum[1]);

+  const uint32x4_t a2 = vpaddlq_u16(sum[2]);

+  const uint32x4_t a3 = vpaddlq_u16(sum[3]);

+  const uint32x4_t a4 = vpaddlq_u16(sum[4]);

+  const uint32x4_t a5 = vpaddlq_u16(sum[5]);

+  const uint32x4_t a6 = vpaddlq_u16(sum[6]);

+  const uint32x4_t a7 = vpaddlq_u16(sum[7]);

+  const uint32x4_t b0 = vaddq_u32(a0, a1);  // merge each even/odd accumulator pair

+  const uint32x4_t b1 = vaddq_u32(a2, a3);

+  const uint32x4_t b2 = vaddq_u32(a4, a5);

+  const uint32x4_t b3 = vaddq_u32(a6, a7);

+  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));  // fold 4 lanes to 2

+  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));

+  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));

+  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));

+  const uint32x2_t d0 = vpadd_u32(c0, c1);  // { total of sum[0..1], total of sum[2..3] }

+  const uint32x2_t d1 = vpadd_u32(c2, c3);  // { total of sum[4..5], total of sum[6..7] }

+  vst1q_u32(res, vcombine_u32(d0, d1));  // writes res[0..3] in a single store

+}

+static INLINE void sad8x_4d(const uint8_t *src, int src_stride,

+                            const uint8_t *const ref[4], int ref_stride,

+                            uint32_t *res, const int height) {

   int i, j;

+  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0) };

-  const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };

   for (i = 0; i < height; ++i) {

-    const uint8x8_t a_u8 = vld1_u8(a);

-    a += a_stride;

+    const uint8x8_t s = vld1_u8(src);

+    src += src_stride;

     for (j = 0; j < 4; ++j) {

-      const uint8x8_t b_u8 = vld1_u8(b_loop[j]);

-      b_loop[j] += b_stride;

-      sum[j] = vabal_u8(sum[j], a_u8, b_u8);

+      const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);

+      ref_loop[j] += ref_stride;

+      sum[j] = vabal_u8(sum[j], s, b_u8);

-  for (j = 0; j < 4; ++j) {

-    result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);

-  }

+  sad_512_pel_final_neon(sum, res);

 void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,

@@ -88,28 +176,33 @@

   sad8x_4d(src, src_stride, ref, ref_stride, res, 16);

-static INLINE void sad16x_4d(const uint8_t *a, int a_stride,

-                             const uint8_t *const b[4], int b_stride,

-                             uint32_t *result, const int height) {

+////////////////////////////////////////////////////////////////////////////////

+static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,

+                              uint16x8_t *const sum) {  // Adds the absolute differences between 16 bytes at ref and the 16 bytes in src into *sum.

+  const uint8x16_t r = vld1q_u8(ref);  // 16 reference bytes

+  *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));  // bytes 0-7: widening |src - r| accumulated into the eight 16-bit lanes

+  *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));  // bytes 8-15 accumulated into the same eight lanes

+}

+static INLINE void sad16x_4d(const uint8_t *src, int src_stride,

+                             const uint8_t *const ref[4], int ref_stride,

+                             uint32_t *res, const int height) {

   int i, j;

+  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0) };

-  const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_u8 = vld1q_u8(a);

-    a += a_stride;

+    const uint8x16_t s = vld1q_u8(src);

+    src += src_stride;

     for (j = 0; j < 4; ++j) {

-      const uint8x16_t b_u8 = vld1q_u8(b_loop[j]);

-      b_loop[j] += b_stride;

-      sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8));

-      sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8));

+      sad16_neon(ref_loop[j], s, &sum[j]);

+      ref_loop[j] += ref_stride;

-  for (j = 0; j < 4; ++j) {

-    result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);

-  }

+  sad_512_pel_final_neon(sum, res);

 void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,

@@ -130,31 +223,36 @@

   sad16x_4d(src, src_stride, ref, ref_stride, res, 32);

-static INLINE void sad32x_4d(const uint8_t *a, int a_stride,

-                             const uint8_t *const b[4], int b_stride,

-                             uint32_t *result, const int height) {

-  int i, j;

-  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

-                        vdupq_n_u16(0) };

-  const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };

+////////////////////////////////////////////////////////////////////////////////

+static INLINE void sad32x_4d(const uint8_t *src, int src_stride,

+                             const uint8_t *const ref[4], int ref_stride,

+                             const int height, uint16x8_t *const sum) {

+  int i;

+  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_0 = vld1q_u8(a);

-    const uint8x16_t a_1 = vld1q_u8(a + 16);

-    a += a_stride;

-    for (j = 0; j < 4; ++j) {

-      const uint8x16_t b_0 = vld1q_u8(b_loop[j]);

-      const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16);

-      b_loop[j] += b_stride;

-      sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0));

-      sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0));

-      sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1));

-      sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1));

-    }

-  }

+    uint8x16_t s;

-  for (j = 0; j < 4; ++j) {

-    result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);

+    s = vld1q_u8(src + 0 * 16);

+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

+    s = vld1q_u8(src + 1 * 16);

+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

+    src += src_stride;

+    ref_loop[0] += ref_stride;

+    ref_loop[1] += ref_stride;

+    ref_loop[2] += ref_stride;

+    ref_loop[3] += ref_stride;

@@ -161,82 +259,116 @@

 void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,

                           const uint8_t *const ref[4], int ref_stride,

                           uint32_t *res) {

-  sad32x_4d(src, src_stride, ref, ref_stride, res, 16);

+  uint16x8_t sum[4];

+  sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);

+  sad_512_pel_final_neon(sum, res);

 void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,

                           const uint8_t *const ref[4], int ref_stride,

                           uint32_t *res) {

-  sad32x_4d(src, src_stride, ref, ref_stride, res, 32);

+  uint16x8_t sum[4];

+  sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);

+  sad_1024_pel_final_neon(sum, res);

 void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,

                           const uint8_t *const ref[4], int ref_stride,

                           uint32_t *res) {

-  sad32x_4d(src, src_stride, ref, ref_stride, res, 64);

+  uint16x8_t sum[4];

+  sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);

+  sad_2048_pel_final_neon(sum, res);

-static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1,

-                          const uint8x16_t b_0, const uint8x16_t b_1,

-                          uint16x8_t *sum) {

-  *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0));

-  *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0));

-  *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1));

-  *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1));

-}

-static INLINE void sad64x_4d(const uint8_t *a, int a_stride,

-                             const uint8_t *const b[4], int b_stride,

-                             uint32_t *result, const int height) {

+////////////////////////////////////////////////////////////////////////////////

+void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,

+                          const uint8_t *const ref[4], int ref_stride,

+                          uint32_t *res) {

   int i;

-  uint16x8_t sum_0 = vdupq_n_u16(0);

-  uint16x8_t sum_1 = vdupq_n_u16(0);

-  uint16x8_t sum_2 = vdupq_n_u16(0);

-  uint16x8_t sum_3 = vdupq_n_u16(0);

-  uint16x8_t sum_4 = vdupq_n_u16(0);

-  uint16x8_t sum_5 = vdupq_n_u16(0);

-  uint16x8_t sum_6 = vdupq_n_u16(0);

-  uint16x8_t sum_7 = vdupq_n_u16(0);

-  const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };

+  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

+                        vdupq_n_u16(0) };

-  for (i = 0; i < height; ++i) {

-    const uint8x16_t a_0 = vld1q_u8(a);

-    const uint8x16_t a_1 = vld1q_u8(a + 16);

-    const uint8x16_t a_2 = vld1q_u8(a + 32);

-    const uint8x16_t a_3 = vld1q_u8(a + 48);

-    a += a_stride;

-    sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0);

-    sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48),

-           &sum_1);

-    b_loop[0] += b_stride;

-    sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2);

-    sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48),

-           &sum_3);

-    b_loop[1] += b_stride;

-    sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4);

-    sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48),

-           &sum_5);

-    b_loop[2] += b_stride;

-    sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6);

-    sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48),

-           &sum_7);

-    b_loop[3] += b_stride;

+  for (i = 0; i < 32; ++i) {

+    uint8x16_t s;

+    s = vld1q_u8(src + 0 * 16);

+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

+    s = vld1q_u8(src + 1 * 16);

+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

+    s = vld1q_u8(src + 2 * 16);

+    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);

+    s = vld1q_u8(src + 3 * 16);

+    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);

+    src += src_stride;

+    ref_loop[0] += ref_stride;

+    ref_loop[1] += ref_stride;

+    ref_loop[2] += ref_stride;

+    ref_loop[3] += ref_stride;

-  result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0);

-  result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0);

-  result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0);

-  result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0);

+  sad_2048_pel_final_neon(sum, res);

-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

-                          uint32_t *res) {

-  sad64x_4d(src, src_stride, ref, ref_stride, res, 32);

-}

 void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,

                           const uint8_t *const ref[4], int ref_stride,

                           uint32_t *res) {

-  sad64x_4d(src, src_stride, ref, ref_stride, res, 64);

+  int i;

+  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

+                        vdupq_n_u16(0), vdupq_n_u16(0) };

+  for (i = 0; i < 64; ++i) {

+    uint8x16_t s;

+    s = vld1q_u8(src + 0 * 16);

+    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);

+    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);

+    s = vld1q_u8(src + 1 * 16);

+    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

+    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);

+    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);

+    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);

+    s = vld1q_u8(src + 2 * 16);

+    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);

+    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);

+    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);

+    s = vld1q_u8(src + 3 * 16);

+    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);

+    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);

+    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);

+    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);

+    src += src_stride;

+    ref_loop[0] += ref_stride;

+    ref_loop[1] += ref_stride;

+    ref_loop[2] += ref_stride;

+    ref_loop[3] += ref_stride;

+  }

+  sad_4096_pel_final_neon(sum, res);

--- a/vpx_dsp/arm/sum_neon.h

+++ b/vpx_dsp/arm/sum_neon.h

@@ -30,15 +30,6 @@

                   vreinterpret_u32_u64(vget_high_u64(c)));

-static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,

-                                                      const uint16x8_t b) {

-  const uint32x4_t c = vpaddlq_u16(a);

-  const uint32x4_t d = vpadalq_u16(c, b);

-  const uint64x2_t e = vpaddlq_u32(d);

-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),

-                  vreinterpret_u32_u64(vget_high_u64(e)));

-}

 static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {

   const uint64x2_t b = vpaddlq_u32(a);

   return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),