ref: 300a3bfc7395d04d55837b1d065cf2858ce7a2ae
parent: 4b3e44f91d2742319556a247647fb95d52b3c24c
author: James Yu <[email protected]>
date: Tue Dec 17 13:28:11 EST 2013
VP8 for ARMv8 by using NEON intrinsics 02 Add copymem_neon.c - vp8_copy_mem16x16_neon - vp8_copy_mem8x8_neon - vp8_copy_mem8x4_neon vpxdec --summary --noblit ../videos/tears_of_steel_1080p.webm Before => After, 13.25 => 13.25 (fps) Change-Id: Ib956b5a20522ff57dc8a580bf0aef7b252bddba6 Signed-off-by: James Yu <[email protected]>
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem16x16_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem16x16_neon| PROC
-
- vld1.u8 {q0}, [r0], r1
- vld1.u8 {q1}, [r0], r1
- vld1.u8 {q2}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- vld1.u8 {q3}, [r0], r1
- vst1.u8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {q2}, [r2], r3
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {q4}, [r2], r3
- vld1.u8 {q7}, [r0], r1
- vst1.u8 {q5}, [r2], r3
- vld1.u8 {q8}, [r0], r1
- vst1.u8 {q6}, [r2], r3
- vld1.u8 {q9}, [r0], r1
- vst1.u8 {q7}, [r2], r3
- vld1.u8 {q10}, [r0], r1
- vst1.u8 {q8}, [r2], r3
- vld1.u8 {q11}, [r0], r1
- vst1.u8 {q9}, [r2], r3
- vld1.u8 {q12}, [r0], r1
- vst1.u8 {q10}, [r2], r3
- vld1.u8 {q13}, [r0], r1
- vst1.u8 {q11}, [r2], r3
- vld1.u8 {q14}, [r0], r1
- vst1.u8 {q12}, [r2], r3
- vld1.u8 {q15}, [r0], r1
- vst1.u8 {q13}, [r2], r3
- vst1.u8 {q14}, [r2], r3
- vst1.u8 {q15}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem16x16_neon|
-
- END
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x4_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x4_neon| PROC
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vst1.u8 {d3}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x4_neon|
-
- END
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_copy_mem8x8_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x8_neon| PROC
-
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vld1.u8 {d4}, [r0], r1
- vst1.u8 {d3}, [r2], r3
- vld1.u8 {d5}, [r0], r1
- vst1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r0], r1
- vst1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r0], r1
- vst1.u8 {d6}, [r2], r3
- vst1.u8 {d7}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp8_copy_mem8x8_neon|
-
- END
--- /dev/null
+++ b/vp8/common/arm/neon/copymem_neon.c
@@ -1,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Copies a block 8 bytes wide and 4 rows tall from src to dst. */
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride,
+                          unsigned char *dst, int dst_stride) {
+    int rows_left = 4;
+
+    /* One 64-bit NEON load/store pair moves a full 8-byte row. */
+    while (rows_left-- > 0) {
+        const uint8x8_t row = vld1_u8(src);
+
+        vst1_u8(dst, row);
+        /* Step each pointer by its own stride to reach the next row. */
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies a block 8 bytes wide and 8 rows tall from src to dst. */
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride,
+                          unsigned char *dst, int dst_stride) {
+    int rows_left = 8;
+
+    /* One 64-bit NEON load/store pair moves a full 8-byte row. */
+    while (rows_left-- > 0) {
+        const uint8x8_t row = vld1_u8(src);
+
+        vst1_u8(dst, row);
+        /* Step each pointer by its own stride to reach the next row. */
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies a block 16 bytes wide and 16 rows tall from src to dst. */
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride,
+                            unsigned char *dst, int dst_stride) {
+    int rows_left = 16;
+
+    /* One 128-bit NEON load/store pair moves a full 16-byte row. */
+    while (rows_left-- > 0) {
+        const uint8x16_t row = vld1q_u8(src);
+
+        vst1q_u8(dst, row);
+        /* Step each pointer by its own stride to reach the next row. */
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -159,9 +159,6 @@
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM)
@@ -189,6 +186,7 @@
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))