shithub: openh264

ref: c8eaea2f27652814716ef7a3e9e559f8038af6e2
dir: /codec/processing/src/arm/adaptive_quantization.S/

View raw version
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"

/*
 * SQR_ADD_16BYTES  lo, hi, acc
 *
 *   lo, hi : D registers holding sixteen unsigned bytes (eight each)
 *   acc    : Q register used as a u32x4 accumulator
 *
 * Every byte is squared (8 bit x 8 bit -> 16 bit) and the squares are then
 * pairwise-added into the 32-bit lanes of acc.
 *
 * Scratch: q3 and q8 are overwritten, so no operand may live in them.
 *
 * The __APPLE__ branch refers to its operands positionally, the other branch
 * uses named macro parameters; both emit the same four instructions.
 */
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
    vmull.u8    q3, $0, $0          // q3 = squares of the first eight bytes
    vmull.u8    q8, $1, $1          // q8 = squares of the last eight bytes
    vpadal.u16  $2, q3              // accumulate pairwise sums of q3
    vpadal.u16  $2, q8              // accumulate pairwise sums of q8
.endm
#else
.macro SQR_ADD_16BYTES lo, hi, acc
    vmull.u8    q3, \lo, \lo        // q3 = squares of the first eight bytes
    vmull.u8    q8, \hi, \hi        // q8 = squares of the last eight bytes
    vpadal.u16  \acc, q3            // accumulate pairwise sums of q3
    vpadal.u16  \acc, q8            // accumulate pairwise sums of q8
.endm
#endif


/*
 * SampleVariance16x16_neon
 *
 * Scans a 16x16 block of bytes in two planes and stores two 16-bit
 * variance-style statistics.
 *
 * Arguments (AAPCS, leaf function):
 *   r0    pointer to the first plane  ("ref" in the row-load comments)
 *   r1    byte stride added to r0 after every 16-byte row
 *   r2    pointer to the second plane ("src" in the row-load comments)
 *   r3    byte stride added to r2 after every 16-byte row
 *   [sp]  fifth argument: pointer that receives the two results
 *         NOTE(review): presumably a struct of two adjacent 16-bit fields;
 *         confirm against the C prototype.
 *
 * Stored results, with d = |src - ref| and s = src over all 256 bytes:
 *   halfword 0 = (sum(d*d) >> 8) - (sum(d) >> 8)^2
 *   halfword 1 = (sum(s*s) >> 8) - (sum(s) >> 8)^2
 * The shift by 8 is the division by the 256 samples of the block. Only the
 * low 16 bits of each 32-bit result are written.
 *
 * Accumulators kept live across the loop:
 *   q13  u16x8  sum of d        ("sum")
 *   q10  u16x8  sum of s        ("sum_cur")
 *   q12  u32x4  sum of d*d      ("sqr")
 *   q9   u32x4  sum of s*s      ("sqr_cur")
 * Each u16 lane receives two bytes per row, so at most 32 * 255 = 8160.
 *
 * Clobbers: r0, r2, r12, flags, q0-q3, q8-q15.
 * r12 (ip) is the scratch register, so nothing callee-saved is touched and
 * sp is never moved; the fifth argument is read straight from [sp].
 */
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
    // Row 0 is processed outside the loop so that it initialises the four
    // accumulators directly instead of adding to zeroed registers.
    vld1.8      {q15}, [r0], r1     // q15 = ref row 0, r0 += r1
    vld1.8      {q14}, [r2], r3     // q14 = src row 0, r2 += r3

    vabd.u8     q13, q14, q15       // q13 = |src - ref| per byte
    vmull.u8    q12, d27, d27       // squares of diff bytes 8..15 (u16x8)
    vmull.u8    q11, d26, d26       // squares of diff bytes 0..7  (u16x8)
    vaddl.u16   q12, d24, d25       // widen and fold the high squares (u32x4)
    vpadal.u16  q12, q11            // add pairwise sums of the low squares: sqr

    vaddl.u8    q13, d26, d27       // sum     = diff low half + diff high half

    vaddl.u8    q10, d28, d29       // sum_cur = src low half + src high half

    vmull.u8    q9,  d29, d29       // squares of src bytes 8..15
    vmull.u8    q8,  d28, d28       // squares of src bytes 0..7
    vaddl.u16   q9,  d18, d19       // sqr_cur, built the same way as sqr
    vpadal.u16  q9,  q8

    mov         r12, #15            // rows still to process (16 in total)
pixel_var_16x16_loop0:

    vld1.8      {q0}, [r0], r1      // q0 = next ref row
    vld1.8      {q1}, [r2], r3      // q1 = next src row

    vabd.u8     q2, q0, q1          // q2 = |ref - src| per byte

    vpadal.u8   q10, q1             // sum_cur += pairwise sums of src

    SQR_ADD_16BYTES d4, d5, q12     // sqr += diff^2 (macro clobbers q3, q8)

    vpadal.u8   q13, q2             // sum += pairwise sums of diff

    subs        r12, #1             // NEON ops below leave the flags alone

    SQR_ADD_16BYTES d2, d3, q9      // sqr_cur += src^2 (macro clobbers q3, q8)

    bne         pixel_var_16x16_loop0

    // Horizontal reduction of the four accumulators.
    vadd.u16    d0, d26, d27        // sum:     8 lanes -> 4 lanes
    vadd.u16    d1, d20, d21        // sum_cur: 8 lanes -> 4 lanes
    vpaddl.u16  q0, q0              // d0 = 2 x u32 of sum, d1 = 2 x u32 of sum_cur
    vadd.u32    d2, d24, d25        // sqr:     4 lanes -> 2 lanes
    vadd.u32    d3, d18, d19        // sqr_cur: 4 lanes -> 2 lanes
    vpadd.u32   d0, d0, d1          // d0 = { sum, sum_cur }
    vpadd.u32   d1, d2, d3          // d1 = { sqr, sqr_cur }

    ldr         r12, [sp]           // fifth argument: output pointer

    vshr.u32    q0, q0, #8          // divide all four totals by 256
    vmul.u32    d0, d0              // d0 = { (sum>>8)^2, (sum_cur>>8)^2 }
    vsub.u32    d0, d1, d0          // d0 = { sqr>>8, sqr_cur>>8 } - squares
    vmovl.u32   q0, d0              // spread: result 0 in d0, result 1 in d1
    vst2.16     {d0[0], d1[0]}, [r12]   // store the two low halfwords adjacently

    // WELS_ASM_FUNC_END comes from arm_arch_common_macro.S; it is expected
    // to emit the function return.
WELS_ASM_FUNC_END

#endif