ref: 14bc7731996cd27f1d2193d2f841e92ae8fad703
dir: /vp8/encoder/arm/neon/subtract_neon.c/
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vp8/encoder/block.h"

// Subtracts the predictor from the source for four rows of one block and
// writes the 16-bit differences to be->src_diff.
//
//   be    - supplies the source pixels (*base_src + src, with rows
//           src_stride bytes apart) and the destination src_diff.
//   bd    - supplies the predictor pixels.
//   pitch - row step, in elements, used for both the predictor and src_diff.
//
// Eight bytes are loaded from every source and predictor row, but only the
// first four differences of each row are stored, so each row must have eight
// readable bytes.
void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) {
  const int src_stride = be->src_stride;
  const unsigned char *src_row = *be->base_src + be->src;
  const unsigned char *pred_row = bd->predictor;
  int16_t *diff_row = be->src_diff;
  uint8x8_t src0, src1, src2, src3;
  uint8x8_t pred0, pred1, pred2, pred3;
  int16x4_t diff0, diff1, diff2, diff3;

  // Every load is issued before the first store.
  src0 = vld1_u8(src_row);
  src_row += src_stride;
  src1 = vld1_u8(src_row);
  src_row += src_stride;
  src2 = vld1_u8(src_row);
  src_row += src_stride;
  src3 = vld1_u8(src_row);

  pred0 = vld1_u8(pred_row);
  pred_row += pitch;
  pred1 = vld1_u8(pred_row);
  pred_row += pitch;
  pred2 = vld1_u8(pred_row);
  pred_row += pitch;
  pred3 = vld1_u8(pred_row);

  // vsubl_u8 widens to 16 bits and wraps, so the lanes reinterpreted as
  // signed hold src - pred. Only the low four lanes of each row are kept.
  diff0 = vreinterpret_s16_u16(vget_low_u16(vsubl_u8(src0, pred0)));
  diff1 = vreinterpret_s16_u16(vget_low_u16(vsubl_u8(src1, pred1)));
  diff2 = vreinterpret_s16_u16(vget_low_u16(vsubl_u8(src2, pred2)));
  diff3 = vreinterpret_s16_u16(vget_low_u16(vsubl_u8(src3, pred3)));

  vst1_s16(diff_row, diff0);
  diff_row += pitch;
  vst1_s16(diff_row, diff1);
  diff_row += pitch;
  vst1_s16(diff_row, diff2);
  diff_row += pitch;
  vst1_s16(diff_row, diff3);
}

// Computes src - pred for 16 rows of 16 pixels and writes the 256 16-bit
// differences contiguously to diff, row after row.
//
//   diff        - destination for 256 differences.
//   src         - first source row; rows are src_stride bytes apart.
//   pred        - first predictor row; rows are pred_stride bytes apart.
void vp8_subtract_mby_neon(int16_t *diff, unsigned char *src, int src_stride,
                           unsigned char *pred, int pred_stride) {
  int row_pair;

  // Two rows are handled per iteration; all four loads precede the stores.
  for (row_pair = 0; row_pair < 8; ++row_pair) {
    uint8x16_t src_a, src_b, pred_a, pred_b;
    uint16x8_t a_lo, a_hi, b_lo, b_hi;

    src_a = vld1q_u8(src);
    src += src_stride;
    src_b = vld1q_u8(src);
    src += src_stride;
    pred_a = vld1q_u8(pred);
    pred += pred_stride;
    pred_b = vld1q_u8(pred);
    pred += pred_stride;

    a_lo = vsubl_u8(vget_low_u8(src_a), vget_low_u8(pred_a));
    a_hi = vsubl_u8(vget_high_u8(src_a), vget_high_u8(pred_a));
    b_lo = vsubl_u8(vget_low_u8(src_b), vget_low_u8(pred_b));
    b_hi = vsubl_u8(vget_high_u8(src_b), vget_high_u8(pred_b));

    vst1q_s16(diff, vreinterpretq_s16_u16(a_lo));
    vst1q_s16(diff + 8, vreinterpretq_s16_u16(a_hi));
    vst1q_s16(diff + 16, vreinterpretq_s16_u16(b_lo));
    vst1q_s16(diff + 24, vreinterpretq_s16_u16(b_hi));
    diff += 32;
  }
}

// Computes src - pred for two planes of 8 rows by 8 pixels each and writes
// the 16-bit differences to diff starting at element 256: the 64 values for
// usrc/upred come first, followed by the 64 values for vsrc/vpred.
//
//   diff         - destination base; elements 256..383 are written.
//   usrc, vsrc   - first source row of each plane; rows are src_stride bytes
//                  apart.
//   upred, vpred - first predictor row of each plane; rows are pred_stride
//                  bytes apart.
void vp8_subtract_mbuv_neon(int16_t *diff, unsigned char *usrc,
                            unsigned char *vsrc, int src_stride,
                            unsigned char *upred, unsigned char *vpred,
                            int pred_stride) {
  int plane;

  diff += 256;

  for (plane = 0; plane < 2; ++plane) {
    const unsigned char *src_row = (plane == 0) ? usrc : vsrc;
    const unsigned char *pred_row = (plane == 0) ? upred : vpred;
    int quad;

    // Each pass covers four rows; all eight loads precede the stores.
    for (quad = 0; quad < 2; ++quad) {
      uint8x8_t s0, s1, s2, s3;
      uint8x8_t p0, p1, p2, p3;

      s0 = vld1_u8(src_row);
      src_row += src_stride;
      p0 = vld1_u8(pred_row);
      pred_row += pred_stride;
      s1 = vld1_u8(src_row);
      src_row += src_stride;
      p1 = vld1_u8(pred_row);
      pred_row += pred_stride;
      s2 = vld1_u8(src_row);
      src_row += src_stride;
      p2 = vld1_u8(pred_row);
      pred_row += pred_stride;
      s3 = vld1_u8(src_row);
      src_row += src_stride;
      p3 = vld1_u8(pred_row);
      pred_row += pred_stride;

      vst1q_s16(diff, vreinterpretq_s16_u16(vsubl_u8(s0, p0)));
      vst1q_s16(diff + 8, vreinterpretq_s16_u16(vsubl_u8(s1, p1)));
      vst1q_s16(diff + 16, vreinterpretq_s16_u16(vsubl_u8(s2, p2)));
      vst1q_s16(diff + 24, vreinterpretq_s16_u16(vsubl_u8(s3, p3)));
      diff += 32;
    }
  }
}