shithub: openh264

--- a/codec/common/arm/copy_mb_neon.S

+++ b/codec/common/arm/copy_mb_neon.S

@@ -35,7 +35,7 @@

 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__

-.macro	LOAD_ALIGNED_DATA_WITH_STRIDE

+.macro LOAD_ALIGNED_DATA_WITH_STRIDE

 //	{	//	input: $0~$3, src*, src_stride

     vld1.64	{$0}, [$4,:128], $5

     vld1.64	{$1}, [$4,:128], $5

@@ -44,7 +44,7 @@

 //	}

 .endm

-.macro	STORE_ALIGNED_DATA_WITH_STRIDE

+.macro STORE_ALIGNED_DATA_WITH_STRIDE

 //	{	//	input: $0~$3, dst*, dst_stride

     vst1.64	{$0}, [$4,:128], $5

     vst1.64	{$1}, [$4,:128], $5

@@ -53,7 +53,7 @@

 //	}

 .endm

-.macro	LOAD_UNALIGNED_DATA_WITH_STRIDE

+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE

 //	{	//	input: $0~$3, src*, src_stride

     vld1.64	{$0}, [$4], $5

     vld1.64	{$1}, [$4], $5

@@ -62,7 +62,7 @@

 //	}

 .endm

-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE

+.macro STORE_UNALIGNED_DATA_WITH_STRIDE

 //	{	//	input: $0~$3, dst*, dst_stride

     vst1.64	{$0}, [$4], $5

     vst1.64	{$1}, [$4], $5

@@ -71,7 +71,7 @@

 //	}

 .endm

 #else

-.macro	LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

 //	{	//	input: \arg0~\arg3, src*, src_stride

     vld1.64	{\arg0}, [\arg4,:128], \arg5

     vld1.64	{\arg1}, [\arg4,:128], \arg5

@@ -80,7 +80,7 @@

 //	}

 .endm

-.macro	STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

 //	{	//	input: \arg0~\arg3, dst*, dst_stride

     vst1.64	{\arg0}, [\arg4,:128], \arg5

     vst1.64	{\arg1}, [\arg4,:128], \arg5

@@ -89,7 +89,7 @@

 //	}

 .endm

-.macro	LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

 //	{	//	input: \arg0~\arg3, src*, src_stride

     vld1.64	{\arg0}, [\arg4], \arg5

     vld1.64	{\arg1}, [\arg4], \arg5

@@ -98,7 +98,7 @@

 //	}

 .endm

-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5

 //	{	//	input: \arg0~\arg3, dst*, dst_stride

     vst1.64	{\arg0}, [\arg4], \arg5

     vst1.64	{\arg1}, [\arg4], \arg5

--- a/codec/common/arm/deblocking_neon.S

+++ b/codec/common/arm/deblocking_neon.S

@@ -36,7 +36,7 @@

 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__

-.macro	JMP_IF_128BITS_IS_ZERO

+.macro JMP_IF_128BITS_IS_ZERO

     vorr.s16	$2, $0, $1

     vmov		r3, r2, $2

     orr			r3, r3, r2

@@ -43,7 +43,7 @@

     cmp			r3, #0

 .endm

-.macro	MASK_MATRIX

+.macro MASK_MATRIX

     vabd.u8	$6, $1, $2

     vcgt.u8	$6, $4, $6

@@ -57,7 +57,7 @@

 .endm

-.macro	DIFF_LUMA_LT4_P1_Q1

+.macro DIFF_LUMA_LT4_P1_Q1

     vmov.i8 $9, #128

     vrhadd.u8	$8, $2, $3

     vhadd.u8	$8, $0, $8

@@ -74,7 +74,7 @@

     vabs.s8	$9, $9

 .endm

-.macro	DIFF_LUMA_LT4_P0_Q0

+.macro DIFF_LUMA_LT4_P0_Q0

     vsubl.u8	$5, $0, $3

     vsubl.u8	$6, $2, $1

     vshl.s16	$6, $6, #2

@@ -82,7 +82,7 @@

     vqrshrn.s16		$4, $5, #3

 .endm

-.macro	DIFF_LUMA_EQ4_P2P1P0

+.macro DIFF_LUMA_EQ4_P2P1P0

     vaddl.u8	q4, $1, $2

     vaddl.u8	q5, $3, $4

     vadd.u16	q5, q4, q5

@@ -107,12 +107,12 @@

     vbsl.u8		$6, d10, d8

 .endm

-.macro	DIFF_LUMA_EQ4_MASK

+.macro DIFF_LUMA_EQ4_MASK

     vmov	$3, $2

     vbsl.u8	$3, $0, $1

 .endm

-.macro	DIFF_CHROMA_EQ4_P0Q0

+.macro DIFF_CHROMA_EQ4_P0Q0

     vaddl.u8	$4, $0, $3

     vaddw.u8	$5, $4, $1

     vaddw.u8	$6, $4, $2

@@ -123,38 +123,38 @@

     vrshrn.u16		$8, $6, #2

 .endm

-.macro	LOAD_CHROMA_DATA_4

+.macro LOAD_CHROMA_DATA_4

     vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2

     vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2

 .endm

-.macro	STORE_CHROMA_DATA_4

+.macro STORE_CHROMA_DATA_4

     vst4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2

     vst4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2

 .endm

-.macro	LOAD_LUMA_DATA_3

+.macro LOAD_LUMA_DATA_3

     vld3.u8	{$0[$6],$1[$6],$2[$6]}, [r2], r1

     vld3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1

 .endm

-.macro	STORE_LUMA_DATA_4

+.macro STORE_LUMA_DATA_4

     vst4.u8	{$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1

     vst4.u8	{$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1

 .endm

-.macro	STORE_LUMA_DATA_3

+.macro STORE_LUMA_DATA_3

     vst3.u8	{$0[$6],$1[$6],$2[$6]}, [r3], r1

     vst3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1

 .endm

-.macro	EXTRACT_DELTA_INTO_TWO_PART

+.macro EXTRACT_DELTA_INTO_TWO_PART

     vcge.s8	$1, $0, #0

     vand	$1, $0, $1

     vsub.s8	$0, $1, $0

 .endm

 #else

-.macro	JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2

+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2

     vorr.s16	\arg2, \arg0, \arg1

     vmov		r3, r2, \arg2

     orr			r3, r3, r2

@@ -161,7 +161,7 @@

     cmp			r3, #0

 .endm

-.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6

+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6

     vabd.u8	\arg6, \arg1, \arg2

     vcgt.u8	\arg6, \arg4, \arg6

@@ -174,7 +174,7 @@

     vand.u8	\arg6, \arg6, \arg4

 .endm

-.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

     vmov.i8 \arg9, #128

     vrhadd.u8	\arg8, \arg2, \arg3

     vhadd.u8	\arg8, \arg0, \arg8

@@ -191,7 +191,7 @@

     vabs.s8	\arg9, \arg9

 .endm

-.macro	DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6

+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6

     vsubl.u8	\arg5, \arg0, \arg3

     vsubl.u8	\arg6, \arg2, \arg1

     vshl.s16	\arg6, \arg6, #2

@@ -200,7 +200,7 @@

 .endm

-.macro	DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

     vaddl.u8	q4, \arg1, \arg2

     vaddl.u8	q5, \arg3, \arg4

     vadd.u16	q5, q4, q5

@@ -225,12 +225,12 @@

     vbsl.u8		\arg6, d10, d8

 .endm

-.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3

+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3

     vmov	\arg3, \arg2

     vbsl.u8	\arg3, \arg0, \arg1

 .endm

-.macro	DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

     vaddl.u8	\arg4, \arg0, \arg3

     vaddw.u8	\arg5, \arg4, \arg1

     vaddw.u8	\arg6, \arg4, \arg2

@@ -240,32 +240,32 @@

     vrshrn.u16		\arg8, \arg6, #2

 .endm

-.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

     vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2

     vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2

 .endm

-.macro	STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

     vst4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2

     vst4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2

 .endm

-.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6

+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6

     vld3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1

     vld3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1

 .endm

-.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5

+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5

     vst4.u8	{\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1

     vst4.u8	{\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1

 .endm

-.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6

+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6

     vst3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1

     vst3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1

 .endm

-.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1

+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1

     vcge.s8	\arg1, \arg0, #0

     vand	\arg1, \arg0, \arg1

     vsub.s8	\arg0, \arg1, \arg0

--- a/codec/common/arm/mc_neon.S

+++ b/codec/common/arm/mc_neon.S

@@ -35,7 +35,7 @@

 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__

-.macro	AVERAGE_TWO_8BITS

+.macro AVERAGE_TWO_8BITS

 //	{	// input:dst_d, src_d A and B; working: q13

     vaddl.u8	q13, $2, $1

     vrshrn.u16		$0, q13, #1

@@ -42,7 +42,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS

+.macro FILTER_6TAG_8BITS

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]

     vaddl.u8	q13, $2, $3	//src[0]+src[1]

@@ -53,7 +53,7 @@

 //	}

 .endm

-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used

+.macro FILTER_SINGLE_TAG_8BITS		// when width=17/9, used

 //	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},

     vrev64.8	$2, $0				// X[5][4][3][2][1][0]O

     vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*

@@ -64,7 +64,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0

+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]

     vaddl.u8	q13, $2, $3	//src[0]+src[1]

@@ -77,7 +77,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1

+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]

     vaddl.u8	q13, $2, $3	//src[0]+src[1]

@@ -90,7 +90,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS

+.macro FILTER_6TAG_8BITS_TO_16BITS

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13

     vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]

     vaddl.u8	q13, $2, $3	//src[0]+src[1]

@@ -100,7 +100,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS

+.macro FILTER_3_IN_16BITS_TO_8BITS

 //	{	// input:a, b, c, dst_d;

     vsub.s16	$0, $0, $1			//a-b

     vshr.s16	$0, $0, #2			//(a-b)/4

@@ -112,7 +112,7 @@

 //	}

 .endm

-.macro	UNPACK_2_16BITS_TO_ABC

+.macro UNPACK_2_16BITS_TO_ABC

 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;

     vext.16	$4, $0, $1, #2		//src[0]

     vext.16	$3, $0, $1, #3		//src[1]

@@ -127,7 +127,7 @@

 //	}

 .endm

-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS

+.macro UNPACK_1_IN_8x16BITS_TO_8BITS

 //	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)

     vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],

     vrev64.16	$1, $1

@@ -145,7 +145,7 @@

 //	}

 .endm

 #else

-.macro	AVERAGE_TWO_8BITS arg0, arg1, arg2

+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2

 //	{	// input:dst_d, src_d A and B; working: q13

     vaddl.u8	q13, \arg2, \arg1

     vrshrn.u16		\arg0, q13, #1

@@ -152,7 +152,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]

     vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]

@@ -163,7 +163,7 @@

 //	}

 .endm

-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used

+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used

 //	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}

     vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O

     vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*

@@ -174,7 +174,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]

     vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]

@@ -187,7 +187,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13

     vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]

     vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]

@@ -200,7 +200,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13

     vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]

     vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]

@@ -210,7 +210,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3

+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3

 //	{	// input:a, b, c, dst_d;

     vsub.s16	\arg0, \arg0, \arg1			//a-b

     vshr.s16	\arg0, \arg0, #2			//(a-b)/4

@@ -222,7 +222,7 @@

 //	}

 .endm

-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4

+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4

 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;

     vext.16	\arg4, \arg0, \arg1, #2		//src[0]

     vext.16	\arg3, \arg0, \arg1, #3		//src[1]

@@ -237,7 +237,7 @@

 //	}

 .endm

-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3

+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3

 //	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)

     vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]

     vrev64.16	\arg1, \arg1

--- a/codec/common/arm64/mc_aarch64_neon.S

+++ b/codec/common/arm64/mc_aarch64_neon.S

@@ -38,7 +38,7 @@

 #ifdef __APPLE__

-.macro	FILTER_6TAG_8BITS1

+.macro FILTER_6TAG_8BITS1

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]

     uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]

@@ -49,7 +49,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2

+.macro FILTER_6TAG_8BITS2

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]

@@ -60,7 +60,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0

+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]

     uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]

@@ -73,7 +73,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0

+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]

@@ -86,7 +86,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1

+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]

     uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]

@@ -99,7 +99,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1

+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]

@@ -112,7 +112,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS1

+.macro FILTER_6TAG_8BITS_TO_16BITS1

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31

     uaddl	$6.8h, $0.8b, $5.8b		//dst_q=src[-2]+src[3]

     uaddl	v31.8h, $2.8b, $3.8b	//src[0]+src[1]

@@ -122,7 +122,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS2

+.macro FILTER_6TAG_8BITS_TO_16BITS2

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31

     uaddl2	$6.8h, $0.16b, $5.16b		//dst_q=src[-2]+src[3]

     uaddl2	v31.8h, $2.16b, $3.16b	//src[0]+src[1]

@@ -132,7 +132,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS1

+.macro FILTER_3_IN_16BITS_TO_8BITS1

 //	{	// input:a, b, c, dst_d;

     sub	$0.8h, $0.8h, $1.8h			//a-b

     sshr	$0.8h, $0.8h, #2			//(a-b)/4

@@ -144,7 +144,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS2

+.macro FILTER_3_IN_16BITS_TO_8BITS2

 //	{	// input:a, b, c, dst_d;

     sub	$0.8h, $0.8h, $1.8h			//a-b

     sshr	$0.8h, $0.8h, #2			//(a-b)/4

@@ -156,7 +156,7 @@

 //	}

 .endm

-.macro	UNPACK_2_16BITS_TO_ABC

+.macro UNPACK_2_16BITS_TO_ABC

 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;

     ext	$4.16b, $0.16b, $1.16b, #4		//src[0]

     ext	$3.16b, $0.16b, $1.16b, #6		//src[1]

@@ -171,7 +171,7 @@

 //	}

 .endm

-.macro	AVERAGE_TWO_8BITS1

+.macro AVERAGE_TWO_8BITS1

 //	{	// input:dst_d, src_d A and B; working: v30

     uaddl	v30.8h, $2.8b, $1.8b

     rshrn	$0.8b, v30.8h, #1

@@ -178,7 +178,7 @@

 //	}

 .endm

-.macro	AVERAGE_TWO_8BITS2

+.macro AVERAGE_TWO_8BITS2

 //	{	// input:dst_d, src_d A and B; working: v30

     uaddl2	v30.8h, $2.16b, $1.16b

     rshrn2	$0.16b, v30.8h, #1

@@ -185,7 +185,7 @@

 //	}

 .endm

-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used

+.macro FILTER_SINGLE_TAG_8BITS		// when width=17/9, used

 //	{	// input: src_d{Y[0][1][2][3][4][5]X},

     rev64	$2.8b, $0.8b				// X[5][4][3][2][1][0]O

     uaddl	$2.8h, $0.8b, $2.8b			// each 16bits, *[50][41][32][23][14][05]*

@@ -195,7 +195,7 @@

 //	}

 .endm

-.macro	UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23

+.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23

 //	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)

     ext.16b $3, $1, $1, #14       // X[0][1][2][3][4][5]O

     ext.16b $4, $3, $3, #8      // [3][4][5]OX[0][1][2]

@@ -211,7 +211,7 @@

 .endm

 #else

-.macro	FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]

     uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]

@@ -222,7 +222,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]

@@ -233,7 +233,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]

     uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]

@@ -246,7 +246,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]

@@ -259,7 +259,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]

     uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]

@@ -272,7 +272,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19

     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]

     uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]

@@ -285,7 +285,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31

     uaddl	\arg6\().8h, \arg0\().8b, \arg5\().8b		//dst_q=src[-2]+src[3]

     uaddl	v31.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]

@@ -295,7 +295,7 @@

 //	}

 .endm

-.macro	FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31

     uaddl2	\arg6\().8h, \arg0\().16b, \arg5\().16b		//dst_q=src[-2]+src[3]

     uaddl2	v31.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]

@@ -305,7 +305,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3

+.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3

 //	{	// input:a, b, c, dst_d;

     sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b

     sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4

@@ -317,7 +317,7 @@

 //	}

 .endm

-.macro	FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3

+.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3

 //	{	// input:a, b, c, dst_d;

     sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b

     sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4

@@ -329,7 +329,7 @@

 //	}

 .endm

-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4

+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4

 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;

     ext	\arg4\().16b, \arg0\().16b, \arg1\().16b, #4		//src[0]

     ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #6		//src[1]

@@ -344,7 +344,7 @@

 //	}

 .endm

-.macro	AVERAGE_TWO_8BITS1 arg0, arg1, arg2

+.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2

 //	{	// input:dst_d, src_d A and B; working: v30

     uaddl	v30.8h, \arg2\().8b, \arg1\().8b

     rshrn	\arg0\().8b, v30.8h, #1

@@ -351,7 +351,7 @@

 //	}

 .endm

-.macro	AVERAGE_TWO_8BITS2 arg0, arg1, arg2

+.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2

 //	{	// input:dst_d, src_d A and B; working: v30

     uaddl2	v30.8h, \arg2\().16b, \arg1\().16b

     rshrn2	\arg0\().16b, v30.8h, #1

@@ -358,7 +358,7 @@

 //	}

 .endm

-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3

+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3

 // when width=17/9, used

 //	{	// input: src_d{Y[0][1][2][3][4][5]X},

     rev64	\arg2\().8b, \arg0\().8b				// X[5][4][3][2][1][0]O

@@ -369,7 +369,7 @@

 //	}

 .endm

-.macro	UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5

+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5

 //	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)

     ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14       // X[0][1][2][3][4][5]O

     ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8      // [3][4][5]OX[0][1][2]

--- a/codec/common/x86/asm_inc.asm

+++ b/codec/common/x86/asm_inc.asm

@@ -577,7 +577,7 @@

 %endmacro

 ;all 0 for xmm and mm

-%macro	WELS_Zero 1

+%macro WELS_Zero 1

 	pxor %1, %1

 %endmacro

--- a/codec/decoder/core/arm/block_add_neon.S

+++ b/codec/decoder/core/arm/block_add_neon.S

@@ -35,7 +35,7 @@

 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__

-.macro	ROW_TRANSFORM_1_STEP

+.macro ROW_TRANSFORM_1_STEP

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9

     vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];

@@ -46,7 +46,7 @@

 //	}

 .endm

-.macro	TRANSFORM_4BYTES	// both row & col transform used

+.macro TRANSFORM_4BYTES	// both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -55,7 +55,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_1_STEP

+.macro COL_TRANSFORM_1_STEP

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];

@@ -68,7 +68,7 @@

 #else

-.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9

     vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];

@@ -79,7 +79,7 @@

 //	}

 .endm

-.macro	TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used

+.macro TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -88,7 +88,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];

--- a/codec/decoder/core/x86/intra_pred.asm

+++ b/codec/decoder/core/x86/intra_pred.asm

@@ -97,7 +97,7 @@

 %endmacro

-%macro	LOAD_COLUMN 6

+%macro LOAD_COLUMN 6

 		movd	%1,	[%5]

 		movd	%2,	[%5+%6]

 		punpcklbw %1,	%2

@@ -143,7 +143,7 @@

 		pshufd		%2,	%2, 0

 %endmacro

-%macro	LOAD_COLUMN_C 6

+%macro LOAD_COLUMN_C 6

 		movd	%1,	[%5]

 		movd	%2,	[%5+%6]

 		punpcklbw %1,%2

--- a/codec/encoder/core/arm/reconstruct_neon.S

+++ b/codec/encoder/core/arm/reconstruct_neon.S

@@ -35,7 +35,7 @@

 #include "arm_arch_common_macro.S"

 #ifdef __APPLE__

-.macro	LOAD_4x4_DATA_FOR_DCT

+.macro LOAD_4x4_DATA_FOR_DCT

 //	{	//	input: $0~$3, src1*, src1_stride, src2*, src2_stride

     vld2.16	{$0[0],$1[0]}, [$4], $5

     vld2.16	{$2[0],$3[0]}, [$6], $7

@@ -49,7 +49,7 @@

 //	}

 .endm

-.macro	LOAD_8x8_DATA_FOR_DCT

+.macro LOAD_8x8_DATA_FOR_DCT

 //	{	//	input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride

     vld1.64	{$0}, [$8], r2

     vld1.64	{$4}, [$9], r4

@@ -63,7 +63,7 @@

 //	}

 .endm

-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS

+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS

 //	{	//	input: src_d[0]~[3], working: [4]~[7]

     vadd.s16		$4, $0, $3			//int16 s[0] = data[i] + data[i3];

     vsub.s16		$7, $0, $3			//int16 s[3] = data[i] - data[i3];

@@ -79,7 +79,7 @@

 //	}

 .endm

-.macro	MATRIX_TRANSFORM_EACH_16BITS

+.macro MATRIX_TRANSFORM_EACH_16BITS

 //	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]

     vtrn.s16		$0, $1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]

     vtrn.s16		$2, $3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]

@@ -88,7 +88,7 @@

 //	}

 .endm

-.macro	NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;

+.macro NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;

 //	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1

     veor.s16		$6, $6			// init 0 , and keep 0;

     vaba.s16		$1, $0, $6		// f + abs(coef - 0)

@@ -106,7 +106,7 @@

 //	}

 .endm

-.macro	NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;

+.macro NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;

 //	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1

     veor.s16		$6, $6			// init 0 , and keep 0;

     vaba.s16		$1, $0, $6		// f + abs(coef - 0)

@@ -125,7 +125,7 @@

 //	}

 .endm

-.macro	QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;

+.macro QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;

 //	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q

     vaba.s16		$1, $0, $3		// f + abs(coef - 0)

     vmull.s16		$4, $1, $2		// *= mf

@@ -139,7 +139,7 @@

 //	}

 .endm

-.macro	DC_ZERO_COUNT_IN_DUALWORD

+.macro DC_ZERO_COUNT_IN_DUALWORD

 //	{	//	input:	coef, dst_d, working_d (all 0x01)

     vceq.s16	$1, $0, #0

     vand.s16	$1, $2

@@ -148,7 +148,7 @@

 //	}

 .endm

-.macro	SELECT_MAX_IN_ABS_COEF

+.macro SELECT_MAX_IN_ABS_COEF

 //	{	//	input:	coef_0, coef_1, max_q (identical to the following two)

     vmax.s16		$2, $0, $1		// max 1st in $3 & max 2nd in $4

     vpmax.s16		$3, $3, $4		// max 1st in $3[0][1] & max 2nd in $3[2][3]

@@ -156,7 +156,7 @@

 //	}

 .endm

-.macro	ZERO_COUNT_IN_2_QUARWORD

+.macro ZERO_COUNT_IN_2_QUARWORD

 //	{	//	input:	coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q

     vceq.s16	$0, #0

     vceq.s16	$1, #0

@@ -171,7 +171,7 @@

 //	}

 .endm

-.macro	HDM_QUANT_2x2_TOTAL_16BITS

+.macro HDM_QUANT_2x2_TOTAL_16BITS

 //	{	//	input: src_d[0]~[3], working_d, dst_d

     vshr.s64	$1, $0, #32

     vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];

@@ -181,7 +181,7 @@

 //	}

 .endm

-.macro	IHDM_4x4_TOTAL_16BITS

+.macro IHDM_4x4_TOTAL_16BITS

 //	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2

     vshr.s64	$1, $0, #32

     vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];

@@ -198,7 +198,7 @@

 //	}

 .endm

-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP

+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP

 //	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;

     vmovl.u8		$4,$0

     vmovl.u8		$5,$1

@@ -209,7 +209,7 @@

 //	}

 .endm

-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS

+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS

 //	{	//	input: src_d[0]~[3], output: e_d[0]~[3];

     vadd.s16		$4, $0, $2			//int16 e[i][0] = src[0] + src[2];

     vsub.s16		$5, $0, $2			//int16 e[i][1] = src[0] - src[2];

@@ -220,7 +220,7 @@

 //	}

 .endm

-.macro	TRANSFORM_TOTAL_16BITS	// both row & col transform used

+.macro TRANSFORM_TOTAL_16BITS	// both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s16		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s16		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -230,7 +230,7 @@

 .endm

-.macro	ROW_TRANSFORM_0_STEP

+.macro ROW_TRANSFORM_0_STEP

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3];

     vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];

@@ -239,7 +239,7 @@

 //	}

 .endm

-.macro	ROW_TRANSFORM_1_STEP

+.macro ROW_TRANSFORM_1_STEP

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9

     vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];

@@ -250,7 +250,7 @@

 //	}

 .endm

-.macro	TRANSFORM_4BYTES	// both row & col transform used

+.macro TRANSFORM_4BYTES	// both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -259,7 +259,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_0_STEP

+.macro COL_TRANSFORM_0_STEP

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];

@@ -268,7 +268,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_1_STEP

+.macro COL_TRANSFORM_1_STEP

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];

@@ -279,7 +279,7 @@

 //	}

 .endm

 #else

-.macro	LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride

     vld2.16	{\arg0[0],\arg1[0]}, [\arg4], \arg5

     vld2.16	{\arg2[0],\arg3[0]}, [\arg6], \arg7

@@ -293,7 +293,7 @@

 //	}

 .endm

-.macro	LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

 //	{	//	input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride

     vld1.64	{\arg0}, [\arg8], r2

     vld1.64	{\arg4}, [\arg9], r4

@@ -307,7 +307,7 @@

 //	}

 .endm

-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_d[0]~[3], working: [4]~[7]

     vadd.s16		\arg4, \arg0, \arg3			//int16 s[0] = data[i] + data[i3];

     vsub.s16		\arg7, \arg0, \arg3			//int16 s[3] = data[i] - data[i3];

@@ -323,7 +323,7 @@

 //	}

 .endm

-.macro	MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3

+.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3

 //	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]

     vtrn.s16		\arg0, \arg1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]

     vtrn.s16		\arg2, \arg3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]

@@ -332,7 +332,7 @@

 //	}

 .endm

-.macro	NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

 //	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1

     veor.s16		\arg6, \arg6			// init 0 , and keep 0;

     vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)

@@ -350,7 +350,7 @@

 //	}

 .endm

-.macro	NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

 //	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1

     veor.s16		\arg6, \arg6			// init 0 , and keep 0;

     vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)

@@ -369,7 +369,7 @@

 //	}

 .endm

-.macro	QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4

+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4

 //	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q

     vaba.s16		\arg1, \arg0, \arg3		// f + abs(coef - 0)

     vmull.s16		\arg4, \arg1, \arg2		// *= mf

@@ -383,7 +383,7 @@

 //	}

 .endm

-.macro	DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2

+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2

 //	{	//	input:	coef, dst_d, working_d (all 0x01)

     vceq.s16	\arg1, \arg0, #0

     vand.s16	\arg1, \arg2

@@ -392,7 +392,7 @@

 //	}

 .endm

-.macro	SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4

+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4

 //	{	//	input:	coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1

     vmax.s16		\arg2, \arg0, \arg1		// max 1st in \arg3 & max 2nd in \arg4

     vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]

@@ -400,7 +400,7 @@

 //	}

 .endm

-.macro	ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6

+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6

 //	{	//	input:	coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q

     vceq.s16	\arg0, #0

     vceq.s16	\arg1, #0

@@ -415,7 +415,7 @@

 //	}

 .endm

-.macro	HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2

+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2

 //	{	//	input: src_d[0]~[3], working_d, dst_d

     vshr.s64	\arg1, \arg0, #32

     vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];

@@ -425,7 +425,7 @@

 //	}

 .endm

-.macro	IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2

+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2

 //	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2

     vshr.s64	\arg1, \arg0, #32

     vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];

@@ -442,7 +442,7 @@

 //	}

 .endm

-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5

+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5

 //	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;

     vmovl.u8		\arg4,\arg0

     vmovl.u8		\arg5,\arg1

@@ -453,7 +453,7 @@

 //	}

 .endm

-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_d[0]~[3], output: e_d[0]~[3];

     vadd.s16		\arg4, \arg0, \arg2			//int16 e[i][0] = src[0] + src[2];

     vsub.s16		\arg5, \arg0, \arg2			//int16 e[i][1] = src[0] - src[2];

@@ -464,7 +464,7 @@

 //	}

 .endm

-.macro	TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used

+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s16		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s16		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -474,7 +474,7 @@

 .endm

-.macro	ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3];

     vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];

@@ -483,7 +483,7 @@

 //	}

 .endm

-.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9

     vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];

     vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];

@@ -494,7 +494,7 @@

 //	}

 .endm

-.macro	TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used

+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used

 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];

     vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];

     vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];

@@ -503,7 +503,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];

@@ -512,7 +512,7 @@

 //	}

 .endm

-.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];

     vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];

     vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];

--- a/codec/encoder/core/x86/dct.asm

+++ b/codec/encoder/core/x86/dct.asm

@@ -454,7 +454,7 @@

 	movdqa	%4, %1

 	psubd	%4, %2

 %endmacro

-%macro		SSE2_Load4Col	5

+%macro SSE2_Load4Col	5

 	movsx		r2,		WORD[%5]

  	movd		%1,			r2d

  	movsx		r2,		WORD[%5 + 0x20]

--- a/codec/encoder/core/x86/intra_pred.asm

+++ b/codec/encoder/core/x86/intra_pred.asm

@@ -108,7 +108,7 @@

 	paddusw     %1, %2

 %endmacro

-%macro	LOAD_COLUMN 6

+%macro LOAD_COLUMN 6

 		movd	%1,	[%5]

 		movd	%2,	[%5+%6]

 		punpcklbw %1,	%2

@@ -155,7 +155,7 @@

 		pshufd		%2,	%2, 0

 %endmacro

-%macro	LOAD_COLUMN_C 6

+%macro LOAD_COLUMN_C 6

 		movd	%1,	[%5]

 		movd	%2,	[%5+%6]

 		punpcklbw %1,%2

--- a/codec/processing/src/x86/denoisefilter.asm

+++ b/codec/processing/src/x86/denoisefilter.asm

@@ -56,7 +56,7 @@

 ;***********************************************************************

 SECTION .text

-%macro	WEIGHT_LINE	9

+%macro WEIGHT_LINE	9

 		movq		%2,	%9

 		punpcklbw	%2,	%7

 		movdqa		%8,	%2

@@ -76,7 +76,7 @@

 		paddusw		%5,	%2

 %endmacro

-%macro	WEIGHT_LINE1_UV	4

+%macro WEIGHT_LINE1_UV	4

 		movdqa		%2,	%1

 		punpcklbw	%2,	%4

 		paddw		%3,	%2

@@ -103,7 +103,7 @@

 		paddw		%3,	%2

 %endmacro

-%macro	WEIGHT_LINE2_UV	4

+%macro WEIGHT_LINE2_UV	4

 		movdqa		%2,	%1

 		punpcklbw	%2,	%4

 		paddw		%3,	%2

@@ -132,7 +132,7 @@

 		paddw		%3,	%2

 %endmacro

-%macro	WEIGHT_LINE3_UV	4

+%macro WEIGHT_LINE3_UV	4

 		movdqa		%2,	%1

 		punpcklbw	%2,	%4

 		psllw		%2,	1