shithub: libvpx

Download patch

ref: 40ab0424d4884190a3450b1e7edf8155c60f011d
parent: f5141ea45fee014b529a264c1fa3bdedd42d038c
author: Linfeng Zhang <[email protected]>
date: Thu Oct 27 12:06:07 EDT 2016

Add high bitdepth intra prediction NEON optimization (mode d45 and d135)

BUG=webm:1316

Change-Id: I6a330874348df04df24a6d9efdc06f567e04bf8e

--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -480,26 +480,34 @@
                        vpx_highbd_dc_predictor_4x4_neon,
                        vpx_highbd_dc_left_predictor_4x4_neon,
                        vpx_highbd_dc_top_predictor_4x4_neon,
-                       vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL,
+                       vpx_highbd_d45_predictor_4x4_neon,
+                       vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL,
+                       NULL, NULL)
 HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8,
                        vpx_highbd_dc_predictor_8x8_neon,
                        vpx_highbd_dc_left_predictor_8x8_neon,
                        vpx_highbd_dc_top_predictor_8x8_neon,
-                       vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL,
+                       vpx_highbd_d45_predictor_8x8_neon,
+                       vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL,
+                       NULL, NULL)
 HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
                        vpx_highbd_dc_predictor_16x16_neon,
                        vpx_highbd_dc_left_predictor_16x16_neon,
                        vpx_highbd_dc_top_predictor_16x16_neon,
-                       vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL,
+                       vpx_highbd_d45_predictor_16x16_neon,
+                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
+                       NULL, NULL)
 HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
                        vpx_highbd_dc_predictor_32x32_neon,
                        vpx_highbd_dc_left_predictor_32x32_neon,
                        vpx_highbd_dc_top_predictor_32x32_neon,
-                       vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL, NULL, NULL)
+                       vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL,
+                       vpx_highbd_d45_predictor_32x32_neon,
+                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
+                       NULL, NULL)
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -416,6 +416,22 @@
 INSTANTIATE_TEST_CASE_P(
     NEON_TO_C_8, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -452,6 +468,22 @@
 INSTANTIATE_TEST_CASE_P(
     NEON_TO_C_10, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -488,6 +520,22 @@
 INSTANTIATE_TEST_CASE_P(
     NEON_TO_C_12, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -283,3 +283,413 @@
   (void)left;
   dc_store_32x32(dst, stride, dc);
 }
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {  // fills a 4x4 block from the above row only
+  const uint16x8_t ABCDEFGH = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);  // above[2..9]
+  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);  // (a + c) >> 1
+  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);  // (avg1 + b + 1) >> 1, i.e. (a + 2b + c + 2) >> 2
+  const uint16x4_t avg2_low = vget_low_u16(avg2);  // filtered lanes 0..3
+  const uint16x4_t avg2_high = vget_high_u16(avg2);  // filtered lanes 4..7
+  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);  // filtered lanes 1..4
+  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);  // filtered lanes 2..5
+  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);  // filtered lanes 3..6
+  (void)left;  // unused
+  (void)bd;  // unused
+  vst1_u16(dst, avg2_low);  // row 0
+  dst += stride;
+  vst1_u16(dst, r1);  // row 1
+  dst += stride;
+  vst1_u16(dst, r2);  // row 2
+  dst += stride;
+  vst1_u16(dst, r3);  // row 3
+  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);  // overwrite last pixel of row 3 with unfiltered above[7]
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+                               const uint16x8_t above_right, uint16x8_t *row) {  // shift *row left one lane, store it, step *dst down
+  *row = vextq_u16(*row, above_right, 1);  // drop lane 0, append lane 0 of above_right
+  vst1q_u16(*dst, *row);  // write the 8-pixel row
+  *dst += stride;  // advance the caller's pointer by one row
+}
+
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {  // fills an 8x8 block from the above row only
+  const uint16x8_t A0 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);  // above[7] in every lane
+  const uint16x8_t A1 = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t A2 = vld1q_u16(above + 2);  // above[2..9]
+  const uint16x8_t avg1 = vhaddq_u16(A0, A2);  // (a + c) >> 1
+  uint16x8_t row = vrhaddq_u16(avg1, A1);  // (a + 2b + c + 2) >> 2 per lane
+  (void)left;  // unused
+  (void)bd;  // unused
+
+  vst1q_u16(dst, row);  // row 0
+  dst += stride;
+  d45_store_8(&dst, stride, above_right, &row);  // rows 1..6: each shifts one more lane of above_right in
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  vst1q_u16(dst, above_right);  // row 7 is entirely above[7]
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                const uint16x8_t above_right, uint16x8_t *row_0,
+                                uint16x8_t *row_1) {  // shift the 16-lane row left by one, store it, step *dst down
+  *row_0 = vextq_u16(*row_0, *row_1, 1);  // left half takes its new last lane from the right half
+  *row_1 = vextq_u16(*row_1, above_right, 1);  // right half takes its new last lane from above_right
+  vst1q_u16(*dst, *row_0);  // pixels 0..7
+  *dst += 8;
+  vst1q_u16(*dst, *row_1);  // pixels 8..15
+  *dst += stride - 8;  // back to column 0 of the next row
+}
+
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {  // fills a 16x16 block from the above row only
+  const uint16x8_t A0_0 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t A0_1 = vld1q_u16(above + 8);  // above[8..15]
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);  // above[15] in every lane
+  const uint16x8_t A1_0 = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t A1_1 = vld1q_u16(above + 9);  // above[9..16]
+  const uint16x8_t A2_0 = vld1q_u16(above + 2);  // above[2..9]
+  const uint16x8_t A2_1 = vld1q_u16(above + 10);  // above[10..17]
+  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);  // (a + c) >> 1
+  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);  // (a + 2b + c + 2) >> 2, pixels 0..7
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);  // same filter, pixels 8..15
+  (void)left;  // unused
+  (void)bd;  // unused
+
+  vst1q_u16(dst, row_0);  // row 0
+  vst1q_u16(dst + 8, row_1);
+  dst += stride;
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);  // rows 1..14: one more lane of above_right per row
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  vst1q_u16(dst, above_right);  // row 15 is entirely above[15]
+  vst1q_u16(dst + 8, above_right);
+}
+
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {  // fills a 32x32 block from the above row only
+  const uint16x8_t A0_0 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t A0_1 = vld1q_u16(above + 8);  // above[8..15]
+  const uint16x8_t A0_2 = vld1q_u16(above + 16);  // above[16..23]
+  const uint16x8_t A0_3 = vld1q_u16(above + 24);  // above[24..31]
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);  // above[31] in every lane
+  const uint16x8_t A1_0 = vld1q_u16(above + 1);  // A1_*: same row offset by one pixel
+  const uint16x8_t A1_1 = vld1q_u16(above + 9);
+  const uint16x8_t A1_2 = vld1q_u16(above + 17);
+  const uint16x8_t A1_3 = vld1q_u16(above + 25);  // above[25..32]
+  const uint16x8_t A2_0 = vld1q_u16(above + 2);  // A2_*: same row offset by two pixels
+  const uint16x8_t A2_1 = vld1q_u16(above + 10);
+  const uint16x8_t A2_2 = vld1q_u16(above + 18);
+  const uint16x8_t A2_3 = vld1q_u16(above + 26);  // above[26..33]
+  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);  // (a + c) >> 1
+  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
+  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);  // (a + 2b + c + 2) >> 2, pixels 0..7
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);  // pixels 8..15
+  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);  // pixels 16..23
+  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);  // pixels 24..31
+  int i;
+  (void)left;  // unused
+  (void)bd;  // unused
+
+  vst1q_u16(dst, row_0);  // row 0, written as four 8-pixel stores
+  dst += 8;
+  vst1q_u16(dst, row_1);
+  dst += 8;
+  vst1q_u16(dst, row_2);
+  dst += 8;
+  vst1q_u16(dst, row_3);
+  dst += stride - 24;  // back to column 0 of the next row
+
+  for (i = 0; i < 30; ++i) {  // rows 1..30
+    row_0 = vextq_u16(row_0, row_1, 1);  // shift the whole 32-lane row left by one lane
+    row_1 = vextq_u16(row_1, row_2, 1);
+    row_2 = vextq_u16(row_2, row_3, 1);
+    row_3 = vextq_u16(row_3, above_right, 1);  // above[31] enters at the right edge
+    vst1q_u16(dst, row_0);
+    dst += 8;
+    vst1q_u16(dst, row_1);
+    dst += 8;
+    vst1q_u16(dst, row_2);
+    dst += 8;
+    vst1q_u16(dst, row_3);
+    dst += stride - 24;  // back to column 0 of the next row
+  }
+
+  vst1q_u16(dst, above_right);  // row 31 is entirely above[31]
+  dst += 8;
+  vst1q_u16(dst, above_right);
+  dst += 8;
+  vst1q_u16(dst, above_right);
+  dst += 8;
+  vst1q_u16(dst, above_right);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {  // fills a 4x4 block from left[0..3], above[-1] and the above row
+  const uint16x8_t XA0123___ = vld1q_u16(above - 1);  // above[-1..6]; X is above[-1]
+  const uint16x4_t L0123 = vld1_u16(left);  // left[0..3]
+  const uint16x4_t L3210 = vrev64_u16(L0123);  // left column in reversed order
+  const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);  // only the high half (L3210) is consumed below
+  const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));  // edge sequence L3 L2 L1 L0 X A0 A1 A2
+  const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);  // same sequence advanced by one
+  const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);  // same sequence advanced by two
+  const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);  // (a + c) >> 1
+  const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);  // (a + 2b + c + 2) >> 2 per lane
+  const uint16x4_t row_0 = vget_low_u16(avg2);  // filtered lanes 0..3
+  const uint16x4_t row_1 = vget_high_u16(avg2);  // filtered lanes 4..7
+  const uint16x4_t r0 = vext_u16(row_0, row_1, 3);  // filtered lanes 3..6
+  const uint16x4_t r1 = vext_u16(row_0, row_1, 2);  // filtered lanes 2..5
+  const uint16x4_t r2 = vext_u16(row_0, row_1, 1);  // filtered lanes 1..4
+  (void)bd;  // unused
+  vst1_u16(dst, r0);  // row 0
+  dst += stride;
+  vst1_u16(dst, r1);  // row 1
+  dst += stride;
+  vst1_u16(dst, r2);  // row 2
+  dst += stride;
+  vst1_u16(dst, row_0);  // row 3: filtered lanes 0..3
+}
+
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {  // fills an 8x8 block from left[0..7], above[-1] and the above row
+  const uint16x8_t XA0123456 = vld1q_u16(above - 1);  // above[-1..6]; X is above[-1]
+  const uint16x8_t A01234567 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t A1234567_ = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t L01234567 = vld1q_u16(left);  // left[0..7]
+  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));  // reverse each half...
+  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);  // ...and swap halves: left column fully reversed
+  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);  // reversed left advanced by one, X appended
+  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);  // advanced by two, X and A0 appended
+  const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);  // (a + c) >> 1
+  const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+  const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);  // filtered left part: (a + 2b + c + 2) >> 2
+  const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);  // filtered above part, lane i centred on above[i]
+  const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);  // row_0[7], row_1[0..6]
+  const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);  // each following row starts one lane earlier in row_0
+  const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+  const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+  const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+  const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+  const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+  (void)bd;  // unused
+  vst1q_u16(dst, r0);  // row 0
+  dst += stride;
+  vst1q_u16(dst, r1);
+  dst += stride;
+  vst1q_u16(dst, r2);
+  dst += stride;
+  vst1q_u16(dst, r3);
+  dst += stride;
+  vst1q_u16(dst, r4);
+  dst += stride;
+  vst1q_u16(dst, r5);
+  dst += stride;
+  vst1q_u16(dst, r6);
+  dst += stride;
+  vst1q_u16(dst, row_0);  // row 7: the filtered left part unshifted
+}
+
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                 const uint16x8_t row_0,
+                                 const uint16x8_t row_1) {  // store one 16-pixel row and step *dst down one row
+  vst1q_u16(*dst, row_0);  // pixels 0..7
+  *dst += 8;
+  vst1q_u16(*dst, row_1);  // pixels 8..15
+  *dst += stride - 8;  // back to column 0 of the next row
+}
+
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {  // fills a 16x16 block from left[0..15], above[-1] and the above row
+  const uint16x8_t L01234567 = vld1q_u16(left);  // left[0..7]
+  const uint16x8_t L89abcdef = vld1q_u16(left + 8);  // left[8..15]
+  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));  // reverse each 4-lane group
+  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+  const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+  const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);  // left[7..0]
+  const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);  // left[15..8]
+  const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);  // left[14..7]
+  const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);  // left[13..6]
+  const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);  // (a + c) >> 1
+  const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);  // filtered edge, lower-left segment
+
+  const uint16x8_t XA0123456 = vld1q_u16(above - 1);  // above[-1..6]; X is above[-1]
+  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);  // left[6..0], X
+  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);  // left[5..0], X, above[0]
+  const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+  const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);  // filtered edge, segment crossing the corner
+
+  const uint16x8_t A01234567 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t A12345678 = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+  const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);  // filtered edge, lane i centred on above[i]
+
+  const uint16x8_t A789abcde = vld1q_u16(above + 7);  // above[7..14]
+  const uint16x8_t A89abcdef = vld1q_u16(above + 8);  // above[8..15]
+  const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);  // above[9..16]
+  const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+  const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);  // filtered edge, lane i centred on above[8 + i]
+
+  const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);  // rN_0 / rN_1: left / right 8 pixels of output row N
+  const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
+  const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);  // each row starts one lane earlier along the filtered edge
+  const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+  const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+  const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+  const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+  const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+  const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+  const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+  const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+  const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+  const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+  const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+  const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);  // rows 8..14: left halves only; right halves reuse r0_0..r6_0
+  const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+  const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+  const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+  const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+  const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+  const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+  (void)bd;  // unused
+
+  d135_store_16(&dst, stride, r0_0, r0_1);  // rows 0..6
+  d135_store_16(&dst, stride, r1_0, r1_1);
+  d135_store_16(&dst, stride, r2_0, r2_1);
+  d135_store_16(&dst, stride, r3_0, r3_1);
+  d135_store_16(&dst, stride, r4_0, r4_1);
+  d135_store_16(&dst, stride, r5_0, r5_1);
+  d135_store_16(&dst, stride, r6_0, r6_1);
+  d135_store_16(&dst, stride, row_1, row_2);  // row 7: unshifted segments
+  d135_store_16(&dst, stride, r8_0, r0_0);  // rows 8..14
+  d135_store_16(&dst, stride, r9_0, r1_0);
+  d135_store_16(&dst, stride, ra_0, r2_0);
+  d135_store_16(&dst, stride, rb_0, r3_0);
+  d135_store_16(&dst, stride, rc_0, r4_0);
+  d135_store_16(&dst, stride, rd_0, r5_0);
+  d135_store_16(&dst, stride, re_0, r6_0);
+  vst1q_u16(dst, row_0);  // row 15, stored inline so dst is not advanced past the block
+  dst += 8;
+  vst1q_u16(dst, row_1);
+}
+
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {  // fills a 32x32 block from left[0..31], above[-1] and the above row
+  const uint16x8_t LL01234567 = vld1q_u16(left + 16);  // LL*: lower half of the left column, left[16..23]
+  const uint16x8_t LL89abcdef = vld1q_u16(left + 24);  // left[24..31]
+  const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));  // reverse each 4-lane group
+  const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+  const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+  const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+  const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);  // left[23..16]
+  const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);  // left[31..24]
+  const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);  // left[30..23]
+  const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);  // left[29..22]
+  const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);  // (a + c) >> 1
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);  // row_0..row_7: consecutive 8-lane segments of the filtered edge
+
+  const uint16x8_t LU01234567 = vld1q_u16(left);  // LU*: upper half of the left column, left[0..7]
+  const uint16x8_t LU89abcdef = vld1q_u16(left + 8);  // left[8..15]
+  const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+  const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+  const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+  const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+  const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);  // left[7..0]
+  const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);  // left[15..8]
+  const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);  // left[22..15]
+  const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);  // left[21..14]
+  const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+  const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);  // left[14..7]
+  const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);  // left[13..6]
+  const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+  uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+  const uint16x8_t XAL0123456 = vld1q_u16(above - 1);  // above[-1..6]; X is above[-1]
+  const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);  // left[6..0], X
+  const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);  // left[5..0], X, above[0]
+  const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+  uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);  // segment crossing the corner
+
+  const uint16x8_t AL01234567 = vld1q_u16(above);  // above[0..7]
+  const uint16x8_t AL12345678 = vld1q_u16(above + 1);  // above[1..8]
+  const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+  uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);  // lane i centred on above[i]
+
+  const uint16x8_t AL789abcde = vld1q_u16(above + 7);  // above[7..14]
+  const uint16x8_t AL89abcdef = vld1q_u16(above + 8);  // above[8..15]
+  const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);  // above[9..16]
+  const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+  uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);  // lane i centred on above[8 + i]
+
+  const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);  // above[15..22]
+  const uint16x8_t AR01234567 = vld1q_u16(above + 16);  // above[16..23]
+  const uint16x8_t AR12345678 = vld1q_u16(above + 17);  // above[17..24]
+  const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+  uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);  // lane i centred on above[16 + i]
+
+  const uint16x8_t AR789abcde = vld1q_u16(above + 23);  // above[23..30]
+  const uint16x8_t AR89abcdef = vld1q_u16(above + 24);  // above[24..31]
+  const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);  // above[25..32]
+  const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+  uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);  // lane i centred on above[24 + i]
+  int i, j;
+  (void)bd;  // unused
+
+  dst += 31 * stride;  // start at the bottom row and work upwards
+  for (i = 0; i < 4; ++i) {  // 4 groups of 8 rows
+    for (j = 0; j < 8; ++j) {
+      vst1q_u16(dst, row_0);  // current 32-pixel row = row_0..row_3
+      dst += 8;
+      vst1q_u16(dst, row_1);
+      dst += 8;
+      vst1q_u16(dst, row_2);
+      dst += 8;
+      vst1q_u16(dst, row_3);
+      dst -= stride + 24;  // back to column 0, one row up
+      row_0 = vextq_u16(row_0, row_1, 1);  // slide the 32-lane window one lane along the filtered edge
+      row_1 = vextq_u16(row_1, row_2, 1);
+      row_2 = vextq_u16(row_2, row_3, 1);
+      row_3 = vextq_u16(row_3, row_4, 1);  // next lane comes from lane 0 of row_4
+      row_4 = vextq_u16(row_4, row_4, 1);  // rotate row_4 so its next lane moves to lane 0
+    }
+    row_4 = row_5;  // after 8 rows row_4 is used up; feed in the next edge segment
+    row_5 = row_6;
+    row_6 = row_7;
+  }
+}
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -211,6 +211,7 @@
   add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -223,6 +224,7 @@
   add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -249,6 +251,7 @@
   add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -261,6 +264,7 @@
   add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -287,6 +291,7 @@
   add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -299,6 +304,7 @@
   add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -325,6 +331,7 @@
   add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -337,6 +344,7 @@
   add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";