ref: 40ab0424d4884190a3450b1e7edf8155c60f011d
parent: f5141ea45fee014b529a264c1fa3bdedd42d038c
author: Linfeng Zhang <[email protected]>
date: Thu Oct 27 12:06:07 EDT 2016
Add high bitdepth intra prediction NEON optimization (mode d45 and d135) BUG=webm:1316 Change-Id: I6a330874348df04df24a6d9efdc06f567e04bf8e
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -480,26 +480,34 @@
vpx_highbd_dc_predictor_4x4_neon,
vpx_highbd_dc_left_predictor_4x4_neon,
vpx_highbd_dc_top_predictor_4x4_neon,
- vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL,
+ vpx_highbd_d45_predictor_4x4_neon,
+ vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL,
+ NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8,
vpx_highbd_dc_predictor_8x8_neon,
vpx_highbd_dc_left_predictor_8x8_neon,
vpx_highbd_dc_top_predictor_8x8_neon,
- vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL,
+ vpx_highbd_d45_predictor_8x8_neon,
+ vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL,
+ NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
vpx_highbd_dc_predictor_16x16_neon,
vpx_highbd_dc_left_predictor_16x16_neon,
vpx_highbd_dc_top_predictor_16x16_neon,
- vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL,
+ vpx_highbd_d45_predictor_16x16_neon,
+ vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
+ NULL, NULL)
HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
vpx_highbd_dc_predictor_32x32_neon,
vpx_highbd_dc_left_predictor_32x32_neon,
vpx_highbd_dc_top_predictor_32x32_neon,
- vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL,
+ vpx_highbd_d45_predictor_32x32_neon,
+ vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
+ NULL, NULL)
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -416,6 +416,22 @@
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_8, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 8),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -452,6 +468,22 @@
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_10, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 10),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -488,6 +520,22 @@
INSTANTIATE_TEST_CASE_P(
NEON_TO_C_12, VP9HighbdIntraPredTest,
::testing::Values(
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+ &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+ &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+ &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+ &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+ &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+ &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+ &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+ HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+ &vpx_highbd_d135_predictor_32x32_c, 32, 12),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
&vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -283,3 +283,413 @@
(void)left;
dc_store_32x32(dst, stride, dc);
}
+
+// -----------------------------------------------------------------------------
+
+// D45 prediction, 4x4 (left and bd unused): row r is lanes r..r + 3 of the
+// 3-tap filtered above row, except that the last pixel is above[7] itself.
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t tap0 = vld1q_u16(above);
+  const uint16x8_t tap1 = vld1q_u16(above + 1);
+  const uint16x8_t tap2 = vld1q_u16(above + 2);
+  // Truncating then rounding halving add: (tap0 + 2 * tap1 + tap2 + 2) >> 2.
+  const uint16x8_t filtered = vrhaddq_u16(vhaddq_u16(tap0, tap2), tap1);
+  const uint16x4_t lo = vget_low_u16(filtered);
+  const uint16x4_t hi = vget_high_u16(filtered);
+  uint16_t *const row1 = dst + stride;
+  uint16_t *const row2 = row1 + stride;
+  uint16_t *const row3 = row2 + stride;
+  (void)left;
+  (void)bd;
+  vst1_u16(dst, lo);
+  vst1_u16(row1, vext_u16(lo, hi, 1));
+  vst1_u16(row2, vext_u16(lo, hi, 2));
+  vst1_u16(row3, vext_u16(lo, hi, 3));
+  // Stored last: overwrites the bottom-right pixel with unfiltered above[7].
+  vst1q_lane_u16(row3 + 3, tap0, 7);
+}
+
+// Slides *row left one lane (feeding above_right), stores it, advances *dst.
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+                               const uint16x8_t above_right, uint16x8_t *row) {
+  uint16_t *const out = *dst;
+  *row = vextq_u16(*row, above_right, 1);
+  vst1q_u16(out, *row);
+  *dst = out + stride;
+}
+
+// D45 prediction, 8x8 (left and bd unused). Row 0 is the 3-tap filtered above
+// row, rows 1..6 slide it left feeding in above[7], row 7 is all above[7].
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t tap0 = vld1q_u16(above);
+  const uint16x8_t tap1 = vld1q_u16(above + 1);
+  const uint16x8_t tap2 = vld1q_u16(above + 2);
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(tap0), 3);
+  // Truncating then rounding halving add: (tap0 + 2 * tap1 + tap2 + 2) >> 2.
+  uint16x8_t row = vrhaddq_u16(vhaddq_u16(tap0, tap2), tap1);
+  int i;
+  (void)left;
+  (void)bd;
+
+  vst1q_u16(dst, row);
+  dst += stride;
+  for (i = 0; i < 6; ++i) d45_store_8(&dst, stride, above_right, &row);
+  vst1q_u16(dst, above_right);
+}
+
+// Slides the row *lo:*hi left one lane (feeding above_right) and stores it.
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                const uint16x8_t above_right, uint16x8_t *lo,
+                                uint16x8_t *hi) {
+  *lo = vextq_u16(*lo, *hi, 1);
+  *hi = vextq_u16(*hi, above_right, 1);
+  vst1q_u16(*dst, *lo);
+  vst1q_u16(*dst + 8, *hi);
+  *dst += stride;
+}
+
+// D45 prediction, 16x16. Reads above[0..17]; left and bd are unused.
+// dst points at the top-left output pixel; stride is the distance between
+// output rows in uint16_t elements.
+//   row 0      : (above[x] + 2 * above[x + 1] + above[x + 2] + 2) >> 2
+//   rows 1..14 : the previous row slid left one pixel, above[15] fed in
+//   row 15     : above[15] in every position
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // Left (_l) and right (_r) 8-pixel halves of above at offsets 0, 1 and 2.
+  const uint16x8_t tap0_l = vld1q_u16(above);
+  const uint16x8_t tap0_r = vld1q_u16(above + 8);
+  const uint16x8_t tap1_l = vld1q_u16(above + 1);
+  const uint16x8_t tap1_r = vld1q_u16(above + 9);
+  const uint16x8_t tap2_l = vld1q_u16(above + 2);
+  const uint16x8_t tap2_r = vld1q_u16(above + 10);
+
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(tap0_r), 3);
+  // Truncating then rounding halving add: (tap0 + 2 * tap1 + tap2 + 2) >> 2.
+  uint16x8_t row_l = vrhaddq_u16(vhaddq_u16(tap0_l, tap2_l), tap1_l);
+  uint16x8_t row_r = vrhaddq_u16(vhaddq_u16(tap0_r, tap2_r), tap1_r);
+  int i;
+  (void)left;
+  (void)bd;
+
+  // Row 0.
+  vst1q_u16(dst, row_l);
+  vst1q_u16(dst + 8, row_r);
+  dst += stride;
+  // Rows 1..14.
+  for (i = 0; i < 14; ++i) {
+    d45_store_16(&dst, stride, above_right, &row_l, &row_r);
+  }
+  // Row 15.
+  vst1q_u16(dst, above_right);
+  vst1q_u16(dst + 8, above_right);
+}
+
+// D45 prediction, 32x32. Reads above[0..33]; left and bd are unused.
+// dst points at the top-left output pixel; stride is the distance between
+// output rows in uint16_t elements.
+//   row 0      : (above[x] + 2 * above[x + 1] + above[x + 2] + 2) >> 2
+//   rows 1..30 : the previous row slid left one pixel, above[31] fed in
+//   row 31     : above[31] in every position
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // Four 8-pixel quarters (_a.._d) of above at offsets 0, 1 and 2.
+  const uint16x8_t tap0_a = vld1q_u16(above);
+  const uint16x8_t tap0_b = vld1q_u16(above + 8);
+  const uint16x8_t tap0_c = vld1q_u16(above + 16);
+  const uint16x8_t tap0_d = vld1q_u16(above + 24);
+  const uint16x8_t tap1_a = vld1q_u16(above + 1);
+  const uint16x8_t tap1_b = vld1q_u16(above + 9);
+  const uint16x8_t tap1_c = vld1q_u16(above + 17);
+  const uint16x8_t tap1_d = vld1q_u16(above + 25);
+  const uint16x8_t tap2_a = vld1q_u16(above + 2);
+  const uint16x8_t tap2_b = vld1q_u16(above + 10);
+  const uint16x8_t tap2_c = vld1q_u16(above + 18);
+  const uint16x8_t tap2_d = vld1q_u16(above + 26);
+
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(tap0_d), 3);
+  // Truncating then rounding halving add: (tap0 + 2 * tap1 + tap2 + 2) >> 2.
+  uint16x8_t row_a = vrhaddq_u16(vhaddq_u16(tap0_a, tap2_a), tap1_a);
+  uint16x8_t row_b = vrhaddq_u16(vhaddq_u16(tap0_b, tap2_b), tap1_b);
+  uint16x8_t row_c = vrhaddq_u16(vhaddq_u16(tap0_c, tap2_c), tap1_c);
+  uint16x8_t row_d = vrhaddq_u16(vhaddq_u16(tap0_d, tap2_d), tap1_d);
+  int i;
+  (void)left;
+  (void)bd;
+
+  // Row 0.
+  vst1q_u16(dst, row_a);
+  vst1q_u16(dst + 8, row_b);
+  vst1q_u16(dst + 16, row_c);
+  vst1q_u16(dst + 24, row_d);
+  dst += stride;
+
+  // Rows 1..30.
+  for (i = 0; i < 30; ++i) {
+    // Each quarter takes its new last lane from the first lane of the next.
+    row_a = vextq_u16(row_a, row_b, 1);
+    row_b = vextq_u16(row_b, row_c, 1);
+    row_c = vextq_u16(row_c, row_d, 1);
+    row_d = vextq_u16(row_d, above_right, 1);
+    vst1q_u16(dst, row_a);
+    vst1q_u16(dst + 8, row_b);
+    vst1q_u16(dst + 16, row_c);
+    vst1q_u16(dst + 24, row_d);
+    dst += stride;
+  }
+
+  // Row 31.
+  vst1q_u16(dst, above_right);
+  vst1q_u16(dst + 8, above_right);
+  vst1q_u16(dst + 16, above_right);
+  vst1q_u16(dst + 24, above_right);
+}
+
+// -----------------------------------------------------------------------------
+
+// D135 prediction, 4x4 (bd is unused). The eight filtered values of the edge
+// sequence L3 L2 L1 L0 X A0 A1 A2 A3 A4 (L = left, X = above[-1], A = above)
+// are computed once; row r is the 4-lane window starting at lane 3 - r.
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8_t x_a0_a6 = vld1q_u16(above - 1);  // X A0 A1 A2 A3 A4 A5 A6
+  const uint16x4_t l0_l3 = vld1_u16(left);
+  const uint16x4_t l3_l0 = vrev64_u16(l0_l3);
+  const uint16x8_t left_pair = vcombine_u16(l0_l3, l3_l0);  // L0..L3 L3..L0
+  const uint16x8_t tap0 = vcombine_u16(l3_l0, vget_low_u16(x_a0_a6));
+  const uint16x8_t tap1 = vextq_u16(left_pair, x_a0_a6, 5);  // starts at L2
+  const uint16x8_t tap2 = vextq_u16(left_pair, x_a0_a6, 6);  // starts at L1
+  // Truncating then rounding halving add: (tap0 + 2 * tap1 + tap2 + 2) >> 2.
+  const uint16x8_t filtered = vrhaddq_u16(vhaddq_u16(tap0, tap2), tap1);
+  const uint16x4_t lo = vget_low_u16(filtered);
+  const uint16x4_t hi = vget_high_u16(filtered);
+  uint16_t *const row1 = dst + stride;
+  uint16_t *const row2 = row1 + stride;
+  uint16_t *const row3 = row2 + stride;
+  (void)bd;
+  vst1_u16(dst, vext_u16(lo, hi, 3));
+  vst1_u16(row1, vext_u16(lo, hi, 2));
+  vst1_u16(row2, vext_u16(lo, hi, 1));
+  vst1_u16(row3, lo);
+}
+
+// D135 prediction, 8x8 (bd is unused). With L = left, X = above[-1] and
+// A = above, the 18-pixel edge sequence L7 .. L0 X A0 .. A8 is 3-tap
+// filtered into 16 values held in two vectors. Output row r is the 8-value
+// window starting at position 7 - r.
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  // Left column, reversed so that it runs bottom-to-top (L7 first).
+  const uint16x8_t l0_l7 = vld1q_u16(left);
+  const uint16x8_t l7_l0 = vcombine_u16(vrev64_u16(vget_high_u16(l0_l7)),
+                                        vrev64_u16(vget_low_u16(l0_l7)));
+  // Above row, including the top-left pixel X = above[-1].
+  const uint16x8_t x_a0_a6 = vld1q_u16(above - 1);
+  const uint16x8_t a0_a7 = vld1q_u16(above);
+  const uint16x8_t a1_a8 = vld1q_u16(above + 1);
+  // Middle and right taps for the left-column vector.
+  const uint16x8_t l6_x = vextq_u16(l7_l0, x_a0_a6, 1);
+  const uint16x8_t l5_a0 = vextq_u16(l7_l0, x_a0_a6, 2);
+  // Truncating then rounding halving add: (p + 2 * q + r + 2) >> 2.
+  const uint16x8_t lo = vrhaddq_u16(vhaddq_u16(l7_l0, l5_a0), l6_x);
+  const uint16x8_t hi = vrhaddq_u16(vhaddq_u16(x_a0_a6, a1_a8), a0_a7);
+  // Rows 0..6 straddle the lo/hi boundary; row 7 is lo itself.
+  const uint16x8_t row0 = vextq_u16(lo, hi, 7);
+  const uint16x8_t row1 = vextq_u16(lo, hi, 6);
+  const uint16x8_t row2 = vextq_u16(lo, hi, 5);
+  const uint16x8_t row3 = vextq_u16(lo, hi, 4);
+  const uint16x8_t row4 = vextq_u16(lo, hi, 3);
+  const uint16x8_t row5 = vextq_u16(lo, hi, 2);
+  const uint16x8_t row6 = vextq_u16(lo, hi, 1);
+  (void)bd;
+
+  vst1q_u16(dst, row0);
+  vst1q_u16(dst + stride, row1);
+  vst1q_u16(dst + 2 * stride, row2);
+  vst1q_u16(dst + 3 * stride, row3);
+  vst1q_u16(dst + 4 * stride, row4);
+  vst1q_u16(dst + 5 * stride, row5);
+  vst1q_u16(dst + 6 * stride, row6);
+  vst1q_u16(dst + 7 * stride, lo);
+}
+
+// Stores one 16-pixel row as two 8-lane halves and steps *dst down one row.
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                 const uint16x8_t left_half,
+                                 const uint16x8_t right_half) {
+  vst1q_u16(*dst, left_half);
+  vst1q_u16(*dst + 8, right_half);
+  *dst += stride;
+}
+
+// D135 prediction, 16x16 (bd is unused). With L = left, X = above[-1] and
+// A = above, the 34-pixel edge sequence L15 .. L0 X A0 .. A16 is 3-tap
+// filtered into 32 values held in f0..f3. Output row r is the 16-value
+// window of those values starting at position 15 - r.
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // Left column, reversed so that it runs bottom-to-top (L15 first).
+  const uint16x8_t l0_l7 = vld1q_u16(left);
+  const uint16x8_t l8_l15 = vld1q_u16(left + 8);
+  const uint16x8_t l7_l0 = vcombine_u16(vrev64_u16(vget_high_u16(l0_l7)),
+                                        vrev64_u16(vget_low_u16(l0_l7)));
+  const uint16x8_t l15_l8 = vcombine_u16(vrev64_u16(vget_high_u16(l8_l15)),
+                                         vrev64_u16(vget_low_u16(l8_l15)));
+  // Above row, including the top-left pixel X = above[-1].
+  const uint16x8_t x_a0_a6 = vld1q_u16(above - 1);
+  const uint16x8_t a0_a7 = vld1q_u16(above);
+  const uint16x8_t a1_a8 = vld1q_u16(above + 1);
+  const uint16x8_t a7_a14 = vld1q_u16(above + 7);
+  const uint16x8_t a8_a15 = vld1q_u16(above + 8);
+  const uint16x8_t a9_a16 = vld1q_u16(above + 9);
+  // Middle and right taps for the two left-column vectors.
+  const uint16x8_t l14_l7 = vextq_u16(l15_l8, l7_l0, 1);
+  const uint16x8_t l13_l6 = vextq_u16(l15_l8, l7_l0, 2);
+  const uint16x8_t l6_x = vextq_u16(l7_l0, x_a0_a6, 1);
+  const uint16x8_t l5_a0 = vextq_u16(l7_l0, x_a0_a6, 2);
+  // Truncating then rounding halving add: (p + 2 * q + r + 2) >> 2.
+  const uint16x8_t f0 = vrhaddq_u16(vhaddq_u16(l15_l8, l13_l6), l14_l7);
+  const uint16x8_t f1 = vrhaddq_u16(vhaddq_u16(l7_l0, l5_a0), l6_x);
+  const uint16x8_t f2 = vrhaddq_u16(vhaddq_u16(x_a0_a6, a1_a8), a0_a7);
+  const uint16x8_t f3 = vrhaddq_u16(vhaddq_u16(a7_a14, a9_a16), a8_a15);
+  // f1/f2 windows: left halves of rows 0..6 and right halves of rows 8..14.
+  const uint16x8_t mid_r0 = vextq_u16(f1, f2, 7);
+  const uint16x8_t mid_r1 = vextq_u16(f1, f2, 6);
+  const uint16x8_t mid_r2 = vextq_u16(f1, f2, 5);
+  const uint16x8_t mid_r3 = vextq_u16(f1, f2, 4);
+  const uint16x8_t mid_r4 = vextq_u16(f1, f2, 3);
+  const uint16x8_t mid_r5 = vextq_u16(f1, f2, 2);
+  const uint16x8_t mid_r6 = vextq_u16(f1, f2, 1);
+  // f2/f3 windows: right halves of rows 0..6.
+  const uint16x8_t right_r0 = vextq_u16(f2, f3, 7);
+  const uint16x8_t right_r1 = vextq_u16(f2, f3, 6);
+  const uint16x8_t right_r2 = vextq_u16(f2, f3, 5);
+  const uint16x8_t right_r3 = vextq_u16(f2, f3, 4);
+  const uint16x8_t right_r4 = vextq_u16(f2, f3, 3);
+  const uint16x8_t right_r5 = vextq_u16(f2, f3, 2);
+  const uint16x8_t right_r6 = vextq_u16(f2, f3, 1);
+  // f0/f1 windows: left halves of rows 8..14.
+  const uint16x8_t left_r8 = vextq_u16(f0, f1, 7);
+  const uint16x8_t left_r9 = vextq_u16(f0, f1, 6);
+  const uint16x8_t left_r10 = vextq_u16(f0, f1, 5);
+  const uint16x8_t left_r11 = vextq_u16(f0, f1, 4);
+  const uint16x8_t left_r12 = vextq_u16(f0, f1, 3);
+  const uint16x8_t left_r13 = vextq_u16(f0, f1, 2);
+  const uint16x8_t left_r14 = vextq_u16(f0, f1, 1);
+  (void)bd;
+
+  d135_store_16(&dst, stride, mid_r0, right_r0);
+  d135_store_16(&dst, stride, mid_r1, right_r1);
+  d135_store_16(&dst, stride, mid_r2, right_r2);
+  d135_store_16(&dst, stride, mid_r3, right_r3);
+  d135_store_16(&dst, stride, mid_r4, right_r4);
+  d135_store_16(&dst, stride, mid_r5, right_r5);
+  d135_store_16(&dst, stride, mid_r6, right_r6);
+  d135_store_16(&dst, stride, f1, f2);
+  d135_store_16(&dst, stride, left_r8, mid_r0);
+  d135_store_16(&dst, stride, left_r9, mid_r1);
+  d135_store_16(&dst, stride, left_r10, mid_r2);
+  d135_store_16(&dst, stride, left_r11, mid_r3);
+  d135_store_16(&dst, stride, left_r12, mid_r4);
+  d135_store_16(&dst, stride, left_r13, mid_r5);
+  d135_store_16(&dst, stride, left_r14, mid_r6);
+  vst1q_u16(dst, f0);
+  vst1q_u16(dst + 8, f1);
+}
+
+// D135 prediction, 32x32 (bd is unused). With L = left, X = above[-1] and
+// A = above, the 66-pixel edge sequence L31 .. L0 X A0 .. A32 is 3-tap
+// filtered into 64 values. Output row r is the 32-value window starting at
+// position 31 - r, so the rows are written bottom-up while the window slides
+// one position per row.
+// dst points at the top-left output pixel; stride is the distance between
+// output rows in uint16_t elements.
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // Left column, reversed so that it runs bottom-to-top (L31 first).
+  const uint16x8_t l0_l7 = vld1q_u16(left);
+  const uint16x8_t l8_l15 = vld1q_u16(left + 8);
+  const uint16x8_t l16_l23 = vld1q_u16(left + 16);
+  const uint16x8_t l24_l31 = vld1q_u16(left + 24);
+  const uint16x8_t l7_l0 = vcombine_u16(vrev64_u16(vget_high_u16(l0_l7)),
+                                        vrev64_u16(vget_low_u16(l0_l7)));
+  const uint16x8_t l15_l8 = vcombine_u16(vrev64_u16(vget_high_u16(l8_l15)),
+                                         vrev64_u16(vget_low_u16(l8_l15)));
+  const uint16x8_t l23_l16 = vcombine_u16(vrev64_u16(vget_high_u16(l16_l23)),
+                                          vrev64_u16(vget_low_u16(l16_l23)));
+  const uint16x8_t l31_l24 = vcombine_u16(vrev64_u16(vget_high_u16(l24_l31)),
+                                          vrev64_u16(vget_low_u16(l24_l31)));
+
+  // Above row, including the top-left pixel X = above[-1].
+  const uint16x8_t x_a0_a6 = vld1q_u16(above - 1);
+  const uint16x8_t a0_a7 = vld1q_u16(above);
+  const uint16x8_t a1_a8 = vld1q_u16(above + 1);
+  const uint16x8_t a7_a14 = vld1q_u16(above + 7);
+  const uint16x8_t a8_a15 = vld1q_u16(above + 8);
+  const uint16x8_t a9_a16 = vld1q_u16(above + 9);
+  const uint16x8_t a15_a22 = vld1q_u16(above + 15);
+  const uint16x8_t a16_a23 = vld1q_u16(above + 16);
+  const uint16x8_t a17_a24 = vld1q_u16(above + 17);
+  const uint16x8_t a23_a30 = vld1q_u16(above + 23);
+  const uint16x8_t a24_a31 = vld1q_u16(above + 24);
+  const uint16x8_t a25_a32 = vld1q_u16(above + 25);
+
+  // Middle and right taps for the four left-column vectors.
+  const uint16x8_t l30_l23 = vextq_u16(l31_l24, l23_l16, 1);
+  const uint16x8_t l29_l22 = vextq_u16(l31_l24, l23_l16, 2);
+  const uint16x8_t l22_l15 = vextq_u16(l23_l16, l15_l8, 1);
+  const uint16x8_t l21_l14 = vextq_u16(l23_l16, l15_l8, 2);
+  const uint16x8_t l14_l7 = vextq_u16(l15_l8, l7_l0, 1);
+  const uint16x8_t l13_l6 = vextq_u16(l15_l8, l7_l0, 2);
+  const uint16x8_t l6_x = vextq_u16(l7_l0, x_a0_a6, 1);
+  const uint16x8_t l5_a0 = vextq_u16(l7_l0, x_a0_a6, 2);
+
+  // Truncating then rounding halving add: (p + 2 * q + r + 2) >> 2.
+  // w0..w3 hold the current 32-value window, initially positions 0..31.
+  uint16x8_t w0 = vrhaddq_u16(vhaddq_u16(l31_l24, l29_l22), l30_l23);
+  uint16x8_t w1 = vrhaddq_u16(vhaddq_u16(l23_l16, l21_l14), l22_l15);
+  uint16x8_t w2 = vrhaddq_u16(vhaddq_u16(l15_l8, l13_l6), l14_l7);
+  uint16x8_t w3 = vrhaddq_u16(vhaddq_u16(l7_l0, l5_a0), l6_x);
+  uint16x8_t incoming[4];
+  int i, j;
+  (void)bd;
+
+  // Filtered above-row values (positions 32..63), consumed 8 per outer pass.
+  incoming[0] = vrhaddq_u16(vhaddq_u16(x_a0_a6, a1_a8), a0_a7);
+  incoming[1] = vrhaddq_u16(vhaddq_u16(a7_a14, a9_a16), a8_a15);
+  incoming[2] = vrhaddq_u16(vhaddq_u16(a15_a22, a17_a24), a16_a23);
+  incoming[3] = vrhaddq_u16(vhaddq_u16(a23_a30, a25_a32), a24_a31);
+
+  // Start at the bottom output row and work upwards.
+  dst += 31 * stride;
+  for (i = 0; i < 4; ++i) {
+    // The next eight values to enter the window from the right.
+    uint16x8_t feed = incoming[i];
+
+    for (j = 0; j < 8; ++j) {
+      // Write the current window as one output row, then move up a row.
+      vst1q_u16(dst, w0);
+      vst1q_u16(dst + 8, w1);
+      vst1q_u16(dst + 16, w2);
+      vst1q_u16(dst + 24, w3);
+      dst -= stride;
+      // Slide the window one position to the right; feed is rotated so that
+      // its lane 0 is always the next value to enter.
+      w0 = vextq_u16(w0, w1, 1);
+      w1 = vextq_u16(w1, w2, 1);
+      w2 = vextq_u16(w2, w3, 1);
+      w3 = vextq_u16(w3, feed, 1);
+      feed = vextq_u16(feed, feed, 1);
+    }
+  }
+}
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -211,6 +211,7 @@
add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -223,6 +224,7 @@
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -249,6 +251,7 @@
add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -261,6 +264,7 @@
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -287,6 +291,7 @@
add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -299,6 +304,7 @@
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -325,6 +331,7 @@
add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -337,6 +344,7 @@
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";