shithub: libvpx

Download patch

ref: a2c69af50ed31a3f56c46ef825098f9acb740a9b
parent: 5d1d72df163a5e6118d3bff641a71d8f2da51330
parent: ce88d74d349da857828f0b7e9c1ce44cf7f56530
author: James Zern <[email protected]>
date: Thu Jun 18 23:27:22 EDT 2015

Merge "vp9_reconintra_neon: add d45 4x4"

--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -211,8 +211,9 @@
 INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
                 vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
                 vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
-                vp9_h_predictor_4x4_neon, NULL, vp9_d135_predictor_4x4_neon,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+                vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+                vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -315,6 +315,31 @@
 
 // -----------------------------------------------------------------------------
 
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
 void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -60,7 +60,7 @@
 specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";