shithub: libvpx

--- a/test/pp_filter_test.cc

+++ b/test/pp_filter_test.cc

@@ -595,6 +595,9 @@

 INSTANTIATE_TEST_CASE_P(

     NEON, VpxPostProcDownAndAcrossMbRowTest,

     ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon));

+INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest,

+                        ::testing::Values(vpx_mbpost_proc_across_ip_neon));

 #endif  // HAVE_NEON

 #if HAVE_MSA

--- a/vpx_dsp/arm/deblock_neon.c

+++ b/vpx_dsp/arm/deblock_neon.c

@@ -257,3 +257,130 @@

     dst_ptr += 8 * dst_stride;

+// sum += x;

+// sumsq += x * y;

+static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,

+                                 int16x4_t *const sum, int32x4_t *const sumsq) {

+  const int16x4_t zero = vdup_n_s16(0);

+  const int32x4_t zeroq = vdupq_n_s32(0);

+  // Add in the first set because vext doesn't work with '0'.

+  *sum = vadd_s16(*sum, x);

+  *sumsq = vaddq_s32(*sumsq, xy);

+  // Shift x and xy to the right and sum. vext requires an immediate.

+  *sum = vadd_s16(*sum, vext_s16(zero, x, 1));

+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));

+  *sum = vadd_s16(*sum, vext_s16(zero, x, 2));

+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));

+  *sum = vadd_s16(*sum, vext_s16(zero, x, 3));

+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));

+}

+// Generate mask based on (sumsq * 15 - sum * sum < flimit)

+static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,

+                                 const int32x4_t f, const int32x4_t fifteen) {

+  const int32x4_t a = vmulq_s32(sumsq, fifteen);

+  const int32x4_t b = vmlsl_s16(a, sum, sum);

+  const uint32x4_t mask32 = vcltq_s32(b, f);

+  return vmovn_u32(mask32);

+}

+static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,

+                              const int32x4_t sumsq_low,

+                              const int32x4_t sumsq_high, const int32x4_t f) {

+  const int32x4_t fifteen = vdupq_n_s32(15);

+  const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);

+  const uint16x4_t mask16_high =

+      calculate_mask(sum_high, sumsq_high, f, fifteen);

+  return vmovn_u16(vcombine_u16(mask16_low, mask16_high));

+}

+// Apply filter of (8 + sum + s[c]) >> 4.

+static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {

+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));

+  const int16x8_t sum_s = vaddq_s16(sum, s16);

+  return vqrshrun_n_s16(sum_s, 4);

+}

+void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,

+                                    int flimit) {

+  int row, col;

+  const int32x4_t f = vdupq_n_s32(flimit);

+  assert(cols % 8 == 0);

+  for (row = 0; row < rows; ++row) {

+    // Sum the first 8 elements, which are extended from s[0].

+    // sumsq gets primed with +16.

+    int sumsq = src[0] * src[0] * 9 + 16;

+    int sum = src[0] * 9;

+    uint8x8_t left_context, s, right_context;

+    int16x4_t sum_low, sum_high;

+    int32x4_t sumsq_low, sumsq_high;

+    // Sum (+square) the next 6 elements.

+    // Skip [0] because it's included above.

+    for (col = 1; col <= 6; ++col) {

+      sumsq += src[col] * src[col];

+      sum += src[col];

+    }

+    // Prime the sums. Later the loop uses the _high values to prime the new

+    // vectors.

+    sumsq_high = vdupq_n_s32(sumsq);

+    sum_high = vdup_n_s16(sum);

+    // Manually extend the left border.

+    left_context = vdup_n_u8(src[0]);

+    for (col = 0; col < cols; col += 8) {

+      uint8x8_t mask, output;

+      int16x8_t x, y;

+      int32x4_t xy_low, xy_high;

+      s = vld1_u8(src + col);

+      if (col + 8 == cols) {

+        // Last row. Extend border.

+        right_context = vdup_n_u8(src[col + 7]);

+      } else {

+        right_context = vld1_u8(src + col + 7);

+      }

+      x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));

+      y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));

+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));

+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));

+      // Catch up to the last sum'd value.

+      sum_low = vdup_lane_s16(sum_high, 3);

+      sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);

+      accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);

+      // Need to do this sequentially because we need the max value from

+      // sum_low.

+      sum_high = vdup_lane_s16(sum_low, 3);

+      sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);

+      accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);

+      mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);

+      output = filter_pixels(vcombine_s16(sum_low, sum_high), s);

+      output = vbsl_u8(mask, output, s);

+      vst1_u8(src + col, output);

+      left_context = s;

+    }

+    src += pitch;

+  }

+}

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -1753,7 +1753,7 @@

     specialize qw/vpx_mbpost_proc_down sse2 msa/;

     add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";

-    specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;

+    specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;

     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";

     specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;