shithub: libvpx

--- a/vpx_dsp/arm/intrapred_neon.c

+++ b/vpx_dsp/arm/intrapred_neon.c

@@ -20,37 +20,35 @@

 // 'do_above' and 'do_left' facilitate branch removal when inlined.

 static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,

                           const uint8_t *left, int do_above, int do_left) {

-  uint16x8_t sum_top;

-  uint16x8_t sum_left;

-  uint8x8_t dc0;

+  uint16x4_t sum_top;

+  uint16x4_t sum_left;

+  uint16x4_t dc0;

   if (do_above) {

     const uint8x8_t A = vld1_u8(above);  // top row

     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top

-    const uint16x4_t p1 = vpadd_u16(p0, p0);

-    sum_top = vcombine_u16(p1, p1);

+    sum_top = vpadd_u16(p0, p0);

   if (do_left) {

     const uint8x8_t L = vld1_u8(left);   // left border

     const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left

-    const uint16x4_t p1 = vpadd_u16(p0, p0);

-    sum_left = vcombine_u16(p1, p1);

+    sum_left = vpadd_u16(p0, p0);

   if (do_above && do_left) {

-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);

-    dc0 = vrshrn_n_u16(sum, 3);

+    const uint16x4_t sum = vadd_u16(sum_left, sum_top);

+    dc0 = vrshr_n_u16(sum, 3);

   } else if (do_above) {

-    dc0 = vrshrn_n_u16(sum_top, 2);

+    dc0 = vrshr_n_u16(sum_top, 2);

   } else if (do_left) {

-    dc0 = vrshrn_n_u16(sum_left, 2);

+    dc0 = vrshr_n_u16(sum_left, 2);

   } else {

-    dc0 = vdup_n_u8(0x80);

+    dc0 = vdup_n_u16(0x80);

-    const uint8x8_t dc = vdup_lane_u8(dc0, 0);

+    const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0);

     int i;

     for (i = 0; i < 4; ++i) {

       vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);