ref: f9efbad392f001c59a38733f61e53611348f7fc5
parent: 5d881770e59498075218e81244b9a07b774bab5a
author: Linfeng Zhang <[email protected]>
date: Fri Aug 12 14:14:21 EDT 2016
NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon(). Also expose the NEON intrinsics version. BUG=webm:1261, webm:1266. Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -531,20 +531,16 @@
make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
- ::testing::Values(
-// Using #if inside the macro is unsupported on MSVS but the tests are not
-// currently built for MSVS with ARM and NEON.
-#if HAVE_NEON_ASM
- make_tuple(&vpx_lpf_horizontal_8_dual_neon,
- &vpx_lpf_horizontal_8_dual_c, 8),
- make_tuple(&vpx_lpf_vertical_8_dual_neon,
- &vpx_lpf_vertical_8_dual_c, 8),
-#endif // HAVE_NEON_ASM
- make_tuple(&vpx_lpf_horizontal_4_dual_neon,
- &vpx_lpf_horizontal_4_dual_c, 8),
- make_tuple(&vpx_lpf_vertical_4_dual_neon,
- &vpx_lpf_vertical_4_dual_c, 8)));
+INSTANTIATE_TEST_CASE_P(
+ NEON, Loop8Test9Param,  // instantiation name, then the parameterized fixture
+ ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+ &vpx_lpf_horizontal_8_dual_c, 8),  // each tuple pairs a *_neon function with the *_c function of the same name
+ make_tuple(&vpx_lpf_vertical_8_dual_neon,
+ &vpx_lpf_vertical_8_dual_c, 8),  // the two 8_dual entries are listed unconditionally (no HAVE_NEON_ASM guard)
+ make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+ &vpx_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&vpx_lpf_vertical_4_dual_neon,
+ &vpx_lpf_vertical_4_dual_c, 8)));  // NOTE(review): the trailing 8 is presumably the bit depth -- confirm against the fixture
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -9,7 +9,9 @@
;
EXPORT |vpx_lpf_horizontal_8_neon|
+ EXPORT |vpx_lpf_horizontal_8_dual_neon|
EXPORT |vpx_lpf_vertical_8_neon|
+ EXPORT |vpx_lpf_vertical_8_dual_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -64,6 +66,38 @@
ENDP ; |vpx_lpf_horizontal_8_neon|
+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+; int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+ push {r0-r1, lr} ; save s, p and the return address across the first call (12 bytes)
+ ldr lr, [sp, #12] ; caller's [sp] (thresh0) now sits 12 bytes above sp
+ push {lr} ; thresh0 goes on top of the stack as the first call's stack argument
+ bl vpx_lpf_horizontal_8_neon ; first call: r0-r3 still hold s, p, blimit0, limit0
+
+ ldr r2, [sp, #20] ; blimit1 (caller's sp + 4, shifted by the 16 bytes pushed)
+ ldr r3, [sp, #24] ; limit1 (caller's sp + 8)
+ ldr lr, [sp, #28] ; load thresh1 (caller's sp + 12)
+ str lr, [sp, #16] ; thresh1 overwrites the caller's thresh0 slot (caller's sp + 0)
+ add sp, #4 ; drop the pushed copy of thresh0
+ pop {r0-r1, lr} ; restore s, p and the return address; sp is back at its entry value
+ add r0, #8 ; s + 8
+ b vpx_lpf_horizontal_8_neon ; branch without link: the second call returns directly to our caller
+ ENDP ; |vpx_lpf_horizontal_8_dual_neon|
+
; void vpx_lpf_vertical_8_neon(uint8_t *s,
; int pitch,
; const uint8_t *blimit,
@@ -138,6 +172,38 @@
pop {r4-r5, pc}
ENDP ; |vpx_lpf_vertical_8_neon|
+
+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int pitch
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+ push {r0-r1, lr} ; save s, pitch and the return address across the first call (12 bytes)
+ ldr lr, [sp, #12] ; caller's [sp] (thresh0) now sits 12 bytes above sp
+ push {lr} ; thresh0 goes on top of the stack as the first call's stack argument
+ bl vpx_lpf_vertical_8_neon ; first call: r0-r3 still hold s, pitch, blimit0, limit0
+
+ ldr r2, [sp, #20] ; blimit1 (caller's sp + 4, shifted by the 16 bytes pushed)
+ ldr r3, [sp, #24] ; limit1 (caller's sp + 8)
+ ldr lr, [sp, #28] ; load thresh1 (caller's sp + 12)
+ str lr, [sp, #16] ; thresh1 overwrites the caller's thresh0 slot (caller's sp + 0)
+ add sp, #4 ; drop the pushed copy of thresh0
+ pop {r0-r1, lr} ; restore s, pitch and the return address; sp is back at its entry value
+ add r0, r1, lsl #3 ; s + 8 * pitch
+ b vpx_lpf_vertical_8_neon ; branch without link: the second call returns directly to our caller
+ ENDP ; |vpx_lpf_vertical_8_dual_neon|
; void vpx_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -311,6 +311,14 @@
return;
}
+void vpx_lpf_horizontal_8_dual_neon(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {  // two back-to-back single-filter calls, one per parameter set
+ vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);  // first call: at s with blimit0/limit0/thresh0
+ vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);  // second call: 8 bytes further along, with blimit1/limit1/thresh1
+}
+
void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
@@ -426,4 +434,12 @@
vst2_lane_u8(s, d2Result, 7);
}
return;
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {  // two back-to-back single-filter calls, one per parameter set
+ vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);  // first call: at s with blimit0/limit0/thresh0
+ vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);  // second call: advanced by 8 * p (p is passed on as the pitch), with blimit1/limit1/thresh1
 }
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -21,21 +21,3 @@
vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
- vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-#endif // HAVE_NEON_ASM
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -514,8 +514,7 @@
specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@@ -533,8 +532,7 @@
specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;